xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64ExpandImm.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PointerAuth.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CFIInstBuilder.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>

using namespace llvm;

#define GET_INSTRINFO_CTOR_DTOR
#include "AArch64GenInstrInfo.inc"

static cl::opt<unsigned>
    CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9),
                       cl::desc("Restrict range of CB instructions (DEBUG)"));

static cl::opt<unsigned> TBZDisplacementBits(
    "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
    cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned> CBZDisplacementBits(
    "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
    cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned>
    BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
                        cl::desc("Restrict range of Bcc instructions (DEBUG)"));

static cl::opt<unsigned>
    BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
                      cl::desc("Restrict range of B instructions (DEBUG)"));

AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
    : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
                          AArch64::CATCHRET),
      RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}

/// GetInstSize - Return the number of bytes of code the specified
/// instruction may occupy.  This returns the maximum number of bytes.
unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  const MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction *MF = MBB.getParent();
  const Function &F = MF->getFunction();
  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();

  {
    auto Op = MI.getOpcode();
    if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
      return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
  }

  // Meta-instructions emit no code.
  if (MI.isMetaInstruction())
    return 0;

  // FIXME: We currently only handle pseudoinstructions that don't get expanded
  //        before the assembly printer.
  unsigned NumBytes = 0;
  const MCInstrDesc &Desc = MI.getDesc();

  if (!MI.isBundle() && isTailCallReturnInst(MI)) {
    NumBytes = Desc.getSize() ? Desc.getSize() : 4;

    const auto *MFI = MF->getInfo<AArch64FunctionInfo>();
    if (!MFI->shouldSignReturnAddress(MF))
      return NumBytes;

    const auto &STI = MF->getSubtarget<AArch64Subtarget>();
    auto Method = STI.getAuthenticatedLRCheckMethod(*MF);
    NumBytes += AArch64PAuth::getCheckerSizeInBytes(Method);
    return NumBytes;
  }

  // The size should preferably be set in
  // llvm/lib/Target/AArch64/AArch64InstrInfo.td (the default case below).
  // The specific cases here handle instructions of variable size.
  switch (Desc.getOpcode()) {
  default:
    if (Desc.getSize())
      return Desc.getSize();

    // Anything not explicitly designated otherwise (i.e. pseudo-instructions
    // with fixed constant size but not specified in .td file) is a normal
    // 4-byte insn.
    NumBytes = 4;
    break;
  case TargetOpcode::STACKMAP:
    // The upper bound for a stackmap intrinsic is the full length of its shadow
    NumBytes = StackMapOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::PATCHPOINT:
    // The size of the patchpoint intrinsic is the number of bytes requested
    NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::STATEPOINT:
    NumBytes = StatepointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    // No patch bytes means a normal call inst is emitted
    if (NumBytes == 0)
      NumBytes = 4;
    break;
  case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
    // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
    // instructions are expanded to the specified number of NOPs. Otherwise,
    // they are expanded to 36-byte XRay sleds.
    NumBytes =
        F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
    break;
  case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
  case TargetOpcode::PATCHABLE_TAIL_CALL:
  case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
    // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
    NumBytes = 36;
    break;
  case TargetOpcode::PATCHABLE_EVENT_CALL:
    // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
    NumBytes = 24;
    break;

  case AArch64::SPACE:
    NumBytes = MI.getOperand(1).getImm();
    break;
  case TargetOpcode::BUNDLE:
    NumBytes = getInstBundleLength(MI);
    break;
  }

  return NumBytes;
}
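
// Worked example (illustrative sketch, not part of the upstream source; the
// attribute value is hypothetical): with "patchable-function-entry"="2", the
// PATCHABLE_FUNCTION_ENTER pseudo above reports 2 * 4 = 8 bytes, i.e. two
// NOPs:
//   nop
//   nop
// Without the attribute, the default of 9 instructions yields the 36-byte
// XRay sled (4 bytes of alignment plus a 32-byte, 8-instruction block).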

unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
  unsigned Size = 0;
  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
  while (++I != E && I->isInsideBundle()) {
    assert(!I->isBundle() && "No nested bundle!");
    Size += getInstSizeInBytes(*I);
  }
  return Size;
}

static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
                            SmallVectorImpl<MachineOperand> &Cond) {
  // Block ends with fall-through condbranch.
  switch (LastInst->getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    Target = LastInst->getOperand(2).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    Cond.push_back(LastInst->getOperand(1));
    break;
  case AArch64::CBWPri:
  case AArch64::CBXPri:
  case AArch64::CBWPrr:
  case AArch64::CBXPrr:
    Target = LastInst->getOperand(3).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    Cond.push_back(LastInst->getOperand(1));
    Cond.push_back(LastInst->getOperand(2));
    break;
  }
}
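
// Summary of the Cond encodings produced above (derived from the pushes in
// parseCondBranch(); the CB pseudo layout is also noted in
// reverseBranchCondition() below):
//   Bcc:             { CC }
//   CB[N]Z[WX]:      { -1, Opcode, Reg }
//   TB[N]Z[WX]:      { -1, Opcode, Reg, BitImm }
//   CB[WX]P{ri,rr}:  { -1, Opcode, CC, Op0, Op1 }
// The leading -1 marks a folded compare-and-branch so consumers such as
// reverseBranchCondition() and instantiateCondBranch() can distinguish it
// from a plain Bcc condition code.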

static unsigned getBranchDisplacementBits(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return BDisplacementBits;
  case AArch64::TBNZW:
  case AArch64::TBZW:
  case AArch64::TBNZX:
  case AArch64::TBZX:
    return TBZDisplacementBits;
  case AArch64::CBNZW:
  case AArch64::CBZW:
  case AArch64::CBNZX:
  case AArch64::CBZX:
    return CBZDisplacementBits;
  case AArch64::Bcc:
    return BCCDisplacementBits;
  case AArch64::CBWPri:
  case AArch64::CBXPri:
  case AArch64::CBWPrr:
  case AArch64::CBXPrr:
    return CBDisplacementBits;
  }
}
bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                             int64_t BrOffset) const {
  unsigned Bits = getBranchDisplacementBits(BranchOp);
  assert(Bits >= 3 && "max branch displacement must be enough to jump "
                      "over conditional branch expansion");
  return isIntN(Bits, BrOffset / 4);
}
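
// Worked example (illustrative sketch): displacements are measured in 4-byte
// instructions, so a TB[N]Z with its 14 displacement bits passes
//   isIntN(14, BrOffset / 4)
// for byte offsets in [-2^15, 2^15 - 4], i.e. roughly +/-32 KiB; a plain B
// with 26 bits reaches about +/-128 MiB by the same arithmetic.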

MachineBasicBlock *
AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return MI.getOperand(0).getMBB();
  case AArch64::TBZW:
  case AArch64::TBNZW:
  case AArch64::TBZX:
  case AArch64::TBNZX:
    return MI.getOperand(2).getMBB();
  case AArch64::CBZW:
  case AArch64::CBNZW:
  case AArch64::CBZX:
  case AArch64::CBNZX:
  case AArch64::Bcc:
    return MI.getOperand(1).getMBB();
  case AArch64::CBWPri:
  case AArch64::CBXPri:
  case AArch64::CBWPrr:
  case AArch64::CBXPrr:
    return MI.getOperand(3).getMBB();
  }
}

void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
                                            MachineBasicBlock &NewDestBB,
                                            MachineBasicBlock &RestoreBB,
                                            const DebugLoc &DL,
                                            int64_t BrOffset,
                                            RegScavenger *RS) const {
  assert(RS && "RegScavenger required for long branching");
  assert(MBB.empty() &&
         "new block should be inserted for expanding unconditional branch");
  assert(MBB.pred_size() == 1);
  assert(RestoreBB.empty() &&
         "restore block should be inserted for restoring clobbered registers");

  auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
    // Offsets outside of the signed 33-bit range are not supported for ADRP +
    // ADD.
    if (!isInt<33>(BrOffset))
      report_fatal_error(
          "Branch offsets outside of the signed 33-bit range not supported");

    BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
        .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
    BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
        .addReg(Reg)
        .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
        .addImm(0);
    BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
  };

  RS->enterBasicBlockEnd(MBB);
  // If X16 is unused, we can rely on the linker to insert a range extension
  // thunk if NewDestBB is out of range of a single B instruction.
  constexpr Register Reg = AArch64::X16;
  if (!RS->isRegUsed(Reg)) {
    insertUnconditionalBranch(MBB, &NewDestBB, DL);
    RS->setRegUsed(Reg);
    return;
  }

  // If there's a free register and it's worth inflating the code size,
  // manually insert the indirect branch.
  Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
  if (Scavenged != AArch64::NoRegister &&
      MBB.getSectionID() == MBBSectionID::ColdSectionID) {
    buildIndirectBranch(Scavenged, NewDestBB);
    RS->setRegUsed(Scavenged);
    return;
  }

  // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
  // with red zones.
  AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
  if (!AFI || AFI->hasRedZone().value_or(true))
    report_fatal_error(
        "Unable to insert indirect branch inside function that has red zone");

  // Otherwise, spill X16 and defer range extension to the linker.
  BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
      .addReg(AArch64::SP, RegState::Define)
      .addReg(Reg)
      .addReg(AArch64::SP)
      .addImm(-16);

  BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);

  BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
      .addReg(AArch64::SP, RegState::Define)
      .addReg(Reg, RegState::Define)
      .addReg(AArch64::SP)
      .addImm(16);
}
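
// For reference (illustrative sketch of the sequence built above, with the
// block names taken from the parameters): when X16 is live and nothing can be
// scavenged, the emitted long branch is
//   str x16, [sp, #-16]!   // spill X16; briefly moves SP (hence no red zone)
//   b   RestoreBB          // linker may retarget this through an X16 thunk
// RestoreBB:
//   ldr x16, [sp], #16     // reload X16, then fall through to NewDestBB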

// Branch analysis.
bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *&TBB,
                                     MachineBasicBlock *&FBB,
                                     SmallVectorImpl<MachineOperand> &Cond,
                                     bool AllowModify) const {
  // If the block has no terminators, it just falls into the block after it.
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return false;

  // Skip over SpeculationBarrierEndBB terminators
  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
    --I;
  }

  if (!isUnpredicatedTerminator(*I))
    return false;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;

  // If there is only one terminator instruction, process it.
  unsigned LastOpc = LastInst->getOpcode();
  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
    if (isUncondBranchOpcode(LastOpc)) {
      TBB = LastInst->getOperand(0).getMBB();
      return false;
    }
    if (isCondBranchOpcode(LastOpc)) {
      // Block ends with fall-through condbranch.
      parseCondBranch(LastInst, TBB, Cond);
      return false;
    }
    return true; // Can't handle indirect branch.
  }

  // Get the instruction before it if it is a terminator.
  MachineInstr *SecondLastInst = &*I;
  unsigned SecondLastOpc = SecondLastInst->getOpcode();

  // If AllowModify is true and the block ends with two or more unconditional
  // branches, delete all but the first unconditional branch.
  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
    while (isUncondBranchOpcode(SecondLastOpc)) {
      LastInst->eraseFromParent();
      LastInst = SecondLastInst;
      LastOpc = LastInst->getOpcode();
      if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
        // Return now; the only terminator is an unconditional branch.
        TBB = LastInst->getOperand(0).getMBB();
        return false;
      }
      SecondLastInst = &*I;
      SecondLastOpc = SecondLastInst->getOpcode();
    }
  }

  // If we're allowed to modify and the block ends in an unconditional branch
  // that could simply fall through, remove the branch.  (Note: This case only
  // matters when we can't understand the whole sequence, otherwise it's also
  // handled by BranchFolding.cpp.)
  if (AllowModify && isUncondBranchOpcode(LastOpc) &&
      MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
    LastInst->eraseFromParent();
    LastInst = SecondLastInst;
    LastOpc = LastInst->getOpcode();
    if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
      assert(!isUncondBranchOpcode(LastOpc) &&
             "unreachable unconditional branches removed above");

      if (isCondBranchOpcode(LastOpc)) {
        // Block ends with fall-through condbranch.
        parseCondBranch(LastInst, TBB, Cond);
        return false;
      }
      return true; // Can't handle indirect branch.
    }
    SecondLastInst = &*I;
    SecondLastOpc = SecondLastInst->getOpcode();
  }

  // If there are three terminators, we don't know what sort of block this is.
  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
    return true;

  // If the block ends with a B and a Bcc, handle it.
  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    parseCondBranch(SecondLastInst, TBB, Cond);
    FBB = LastInst->getOperand(0).getMBB();
    return false;
  }

  // If the block ends with two unconditional branches, handle it.  The second
  // one is not executed, so remove it.
  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    TBB = SecondLastInst->getOperand(0).getMBB();
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return false;
  }

  // ...likewise if it ends with an indirect branch followed by an unconditional
  // branch.
  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return true;
  }

  // Otherwise, can't handle this.
  return true;
}
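
// Example of the canonical analyzable shape (illustrative sketch with
// hypothetical blocks and register):
//   cbz w0, %bb.true   // conditional terminator -> TBB, Cond
//   b   %bb.false      // unconditional terminator -> FBB
// analyzeBranch() returns false (analyzable) with Cond encoded as
// { -1, CBZW, w0 }, per parseCondBranch().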

bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
                                              MachineBranchPredicate &MBP,
                                              bool AllowModify) const {
  // For the moment, handle only a block which ends with a cb(n)zx followed by
  // a fallthrough.  Why this?  Because it is a common form.
  // TODO: Should we handle b.cc?

  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return true;

  // Skip over SpeculationBarrierEndBB terminators
  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
    --I;
  }

  if (!isUnpredicatedTerminator(*I))
    return true;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;
  unsigned LastOpc = LastInst->getOpcode();
  if (!isCondBranchOpcode(LastOpc))
    return true;

  switch (LastOpc) {
  default:
    return true;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    break;
  }

  MBP.TrueDest = LastInst->getOperand(1).getMBB();
  assert(MBP.TrueDest && "expected!");
  MBP.FalseDest = MBB.getNextNode();

  MBP.ConditionDef = nullptr;
  MBP.SingleUseCondition = false;

  MBP.LHS = LastInst->getOperand(0);
  MBP.RHS = MachineOperand::CreateImm(0);
  MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
                                            : MachineBranchPredicate::PRED_EQ;
  return false;
}

bool AArch64InstrInfo::reverseBranchCondition(
    SmallVectorImpl<MachineOperand> &Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
    Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
  } else {
    // Folded compare-and-branch
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown conditional branch!");
    case AArch64::CBZW:
      Cond[1].setImm(AArch64::CBNZW);
      break;
    case AArch64::CBNZW:
      Cond[1].setImm(AArch64::CBZW);
      break;
    case AArch64::CBZX:
      Cond[1].setImm(AArch64::CBNZX);
      break;
    case AArch64::CBNZX:
      Cond[1].setImm(AArch64::CBZX);
      break;
    case AArch64::TBZW:
      Cond[1].setImm(AArch64::TBNZW);
      break;
    case AArch64::TBNZW:
      Cond[1].setImm(AArch64::TBZW);
      break;
    case AArch64::TBZX:
      Cond[1].setImm(AArch64::TBNZX);
      break;
    case AArch64::TBNZX:
      Cond[1].setImm(AArch64::TBZX);
      break;

    // Cond is { -1, Opcode, CC, Op0, Op1 }
    case AArch64::CBWPri:
    case AArch64::CBXPri:
    case AArch64::CBWPrr:
    case AArch64::CBXPrr: {
      // Pseudos using standard 4-bit Arm condition codes
      AArch64CC::CondCode CC =
          static_cast<AArch64CC::CondCode>(Cond[2].getImm());
      Cond[2].setImm(AArch64CC::getInvertedCondCode(CC));
    }
    }
  }

  return false;
}
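
// Usage example (illustrative sketch): a Cond of { NE } for a plain b.ne is
// rewritten to { EQ }, while a folded { -1, TBZX, x0, 63 } merely has its
// opcode flipped to TBNZX; the tested register and bit number are untouched.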

unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
                                        int *BytesRemoved) const {
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return 0;

  if (!isUncondBranchOpcode(I->getOpcode()) &&
      !isCondBranchOpcode(I->getOpcode()))
    return 0;

  // Remove the branch.
  I->eraseFromParent();

  I = MBB.end();

  if (I == MBB.begin()) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }
  --I;
  if (!isCondBranchOpcode(I->getOpcode())) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }

  // Remove the branch.
  I->eraseFromParent();
  if (BytesRemoved)
    *BytesRemoved = 8;

  return 2;
}

void AArch64InstrInfo::instantiateCondBranch(
    MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
    ArrayRef<MachineOperand> Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
  } else {
    // Folded compare-and-branch
    // Note that we use addOperand instead of addReg to keep the flags.

    // cbz, cbnz
    const MachineInstrBuilder MIB =
        BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);

    // tbz/tbnz
    if (Cond.size() > 3)
      MIB.add(Cond[3]);

    // cb
    if (Cond.size() > 4)
      MIB.add(Cond[4]);

    MIB.addMBB(TBB);
  }
}

unsigned AArch64InstrInfo::insertBranch(
    MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
    ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
  // Shouldn't be a fall through.
  assert(TBB && "insertBranch must not be told to insert a fallthrough");

  if (!FBB) {
    if (Cond.empty()) // Unconditional branch?
      BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
    else
      instantiateCondBranch(MBB, DL, TBB, Cond);

    if (BytesAdded)
      *BytesAdded = 4;

    return 1;
  }

  // Two-way conditional branch.
  instantiateCondBranch(MBB, DL, TBB, Cond);
  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);

  if (BytesAdded)
    *BytesAdded = 8;

  return 2;
}

// Find the original register that VReg is copied from.
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
  while (Register::isVirtualRegister(VReg)) {
    const MachineInstr *DefMI = MRI.getVRegDef(VReg);
    if (!DefMI->isFullCopy())
      return VReg;
    VReg = DefMI->getOperand(1).getReg();
  }
  return VReg;
}

// Determine if VReg is defined by an instruction that can be folded into a
// csel instruction. If so, return the folded opcode, and the replacement
// register.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
                                unsigned *NewVReg = nullptr) {
  VReg = removeCopies(MRI, VReg);
  if (!Register::isVirtualRegister(VReg))
    return 0;

  bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
  unsigned Opc = 0;
  unsigned SrcOpNum = 0;
  switch (DefMI->getOpcode()) {
  case AArch64::ADDSXri:
  case AArch64::ADDSWri:
    // if NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
                                         true) == -1)
      return 0;
    // fall-through to ADDXri and ADDWri.
    [[fallthrough]];
  case AArch64::ADDXri:
  case AArch64::ADDWri:
    // add x, 1 -> csinc.
    if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
        DefMI->getOperand(3).getImm() != 0)
      return 0;
    SrcOpNum = 1;
    Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
    break;

  case AArch64::ORNXrr:
  case AArch64::ORNWrr: {
    // not x -> csinv, represented as orn dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcOpNum = 2;
    Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
    break;
  }

  case AArch64::SUBSXrr:
  case AArch64::SUBSWrr:
    // if NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
                                         true) == -1)
      return 0;
    // fall-through to SUBXrr and SUBWrr.
    [[fallthrough]];
  case AArch64::SUBXrr:
  case AArch64::SUBWrr: {
    // neg x -> csneg, represented as sub dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcOpNum = 2;
    Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
    break;
  }
  default:
    return 0;
  }
  assert(Opc && SrcOpNum && "Missing parameters");

  if (NewVReg)
    *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
  return Opc;
}
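
// Example fold (illustrative sketch with hypothetical virtual registers):
// given
//   %t = ADDXri %a, 1, 0          // %t = %a + 1
//   %r = CSELXr %t, %b, eq        // %r = eq ? %a + 1 : %b
// canFoldIntoCSel() returns CSINCXr with %a as the replacement register, so
// insertSelect() below can invert the condition and emit the single
//   %r = CSINCXr %b, %a, ne       // %r = ne ? %b : %a + 1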

bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
                                       ArrayRef<MachineOperand> Cond,
                                       Register DstReg, Register TrueReg,
                                       Register FalseReg, int &CondCycles,
                                       int &TrueCycles,
                                       int &FalseCycles) const {
  // Check register classes.
  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *RC =
      RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
  if (!RC)
    return false;

  // Also need to check the dest regclass, in case we're trying to optimize
  // something like:
  // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
  if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
    return false;

  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
  unsigned ExtraCondLat = Cond.size() != 1;

  // GPRs are handled by csel.
  // FIXME: Fold in x+1, -x, and ~x when applicable.
  if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
      AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
    // Single-cycle csel, csinc, csinv, and csneg.
    CondCycles = 1 + ExtraCondLat;
    TrueCycles = FalseCycles = 1;
    if (canFoldIntoCSel(MRI, TrueReg))
      TrueCycles = 0;
    else if (canFoldIntoCSel(MRI, FalseReg))
      FalseCycles = 0;
    return true;
  }

  // Scalar floating point is handled by fcsel.
  // FIXME: Form fabs, fmin, and fmax when applicable.
  if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
      AArch64::FPR32RegClass.hasSubClassEq(RC)) {
    CondCycles = 5 + ExtraCondLat;
    TrueCycles = FalseCycles = 2;
    return true;
  }

  // Can't do vectors.
  return false;
}

void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const DebugLoc &DL, Register DstReg,
                                    ArrayRef<MachineOperand> Cond,
                                    Register TrueReg, Register FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Parse the condition code, see parseCondBranch() above.
  AArch64CC::CondCode CC;
  switch (Cond.size()) {
  default:
    llvm_unreachable("Unknown condition opcode in Cond");
  case 1: // b.cc
    CC = AArch64CC::CondCode(Cond[0].getImm());
    break;
  case 3: { // cbz/cbnz
    // We must insert a compare against 0.
    bool Is64Bit;
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::CBZW:
      Is64Bit = false;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBZX:
      Is64Bit = true;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBNZW:
      Is64Bit = false;
      CC = AArch64CC::NE;
      break;
    case AArch64::CBNZX:
      Is64Bit = true;
      CC = AArch64CC::NE;
      break;
    }
    Register SrcReg = Cond[2].getReg();
    if (Is64Bit) {
      // cmp reg, #0 is actually subs xzr, reg, #0.
      MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    } else {
      MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    }
    break;
  }
  case 4: { // tbz/tbnz
    // We must insert a tst instruction.
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::TBZW:
    case AArch64::TBZX:
      CC = AArch64CC::EQ;
      break;
    case AArch64::TBNZW:
    case AArch64::TBNZX:
      CC = AArch64CC::NE;
      break;
    }
    // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
      BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
    else
      BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
    break;
  }
  case 5: { // cb
    // We must insert a cmp, that is, a subs.
    //            0       1   2    3    4
    // Cond is { -1, Opcode, CC, Op0, Op1 }
    unsigned SUBSOpC, SUBSDestReg;
    bool IsImm = false;
    CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::CBWPri:
      SUBSOpC = AArch64::SUBSWri;
      SUBSDestReg = AArch64::WZR;
      IsImm = true;
      break;
    case AArch64::CBXPri:
      SUBSOpC = AArch64::SUBSXri;
      SUBSDestReg = AArch64::XZR;
      IsImm = true;
      break;
    case AArch64::CBWPrr:
      SUBSOpC = AArch64::SUBSWrr;
      SUBSDestReg = AArch64::WZR;
      IsImm = false;
      break;
    case AArch64::CBXPrr:
      SUBSOpC = AArch64::SUBSXrr;
      SUBSDestReg = AArch64::XZR;
      IsImm = false;
      break;
    }

    if (IsImm)
      BuildMI(MBB, I, DL, get(SUBSOpC), SUBSDestReg)
          .addReg(Cond[3].getReg())
          .addImm(Cond[4].getImm())
          .addImm(0);
    else
      BuildMI(MBB, I, DL, get(SUBSOpC), SUBSDestReg)
          .addReg(Cond[3].getReg())
          .addReg(Cond[4].getReg());
  }
  }

  unsigned Opc = 0;
  const TargetRegisterClass *RC = nullptr;
  bool TryFold = false;
  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
    RC = &AArch64::GPR64RegClass;
    Opc = AArch64::CSELXr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
    RC = &AArch64::GPR32RegClass;
    Opc = AArch64::CSELWr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FCSELDrrr;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
    RC = &AArch64::FPR32RegClass;
    Opc = AArch64::FCSELSrrr;
  }
  assert(RC && "Unsupported regclass");

  // Try folding simple instructions into the csel.
  if (TryFold) {
    unsigned NewVReg = 0;
    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
    if (FoldedOpc) {
      // The folded opcodes csinc, csinv and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
      CC = AArch64CC::getInvertedCondCode(CC);
      TrueReg = FalseReg;
    } else
      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);

    // Fold the operation. Leave any dead instructions for DCE to clean up.
    if (FoldedOpc) {
      FalseReg = NewVReg;
      Opc = FoldedOpc;
      // This extends the live range of NewVReg.
      MRI.clearKillFlags(NewVReg);
    }
  }

  // Pull all virtual registers into the appropriate class.
  MRI.constrainRegClass(TrueReg, RC);
  MRI.constrainRegClass(FalseReg, RC);

  // Insert the csel.
  BuildMI(MBB, I, DL, get(Opc), DstReg)
      .addReg(TrueReg)
      .addReg(FalseReg)
      .addImm(CC);
}
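
// End-to-end example (illustrative sketch with hypothetical registers):
// lowering a select on Cond = { -1, CBZW, %w0 } produces
//   subs wzr, w0, #0      // re-materialize the compare against zero
//   csel w1, w2, w3, eq   // select under the reconstructed condition
// modulo the csinc/csinv/csneg folding attempted above.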

// Return true if Imm can be loaded into a register by a "cheap" sequence of
// instructions. For now, "cheap" means at most two instructions.
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
  if (BitSize == 32)
    return true;

  assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
  uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
  AArch64_IMM::expandMOVImm(Imm, BitSize, Is);

  return Is.size() <= 2;
}
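
// Examples (illustrative sketch; immediates chosen for exposition):
// expandMOVImm() turns
//   0x0000000012340000 into MOVZ            (1 insn  -> cheap),
//   0x0001000000005678 into MOVZ + MOVK     (2 insns -> cheap),
//   0x123456789abcdef0 into MOVZ + 3x MOVK  (4 insns -> not cheap).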

// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
  if (Subtarget.hasExynosCheapAsMoveHandling()) {
    if (isExynosCheapAsMove(MI))
      return true;
    return MI.isAsCheapAsAMove();
  }

  switch (MI.getOpcode()) {
  default:
    return MI.isAsCheapAsAMove();

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::SUBWrs:
  case AArch64::SUBXrs:
    return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;

  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
  // ORRXri, it is as cheap as MOV.
  // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
  case AArch64::MOVi32imm:
    return isCheapImmediate(MI, 32);
  case AArch64::MOVi64imm:
    return isCheapImmediate(MI, 64);
  }
}

bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::ADDSWrs:
  case AArch64::ADDSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    if (ShiftVal == 0)
      return true;
    return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
  }

  case AArch64::ADDWrx:
  case AArch64::ADDXrx:
  case AArch64::ADDXrx64:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrx:
  case AArch64::ADDSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) <= 4;
    }
  }

  case AArch64::SUBWrs:
  case AArch64::SUBSWrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
  }

  case AArch64::SUBXrs:
  case AArch64::SUBSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
  }

  case AArch64::SUBWrx:
  case AArch64::SUBXrx:
  case AArch64::SUBXrx64:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrx:
  case AArch64::SUBSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) == 0;
    }
  }

  case AArch64::LDRBBroW:
  case AArch64::LDRBBroX:
  case AArch64::LDRBroW:
  case AArch64::LDRBroX:
  case AArch64::LDRDroW:
  case AArch64::LDRDroX:
  case AArch64::LDRHHroW:
  case AArch64::LDRHHroX:
  case AArch64::LDRHroW:
  case AArch64::LDRHroX:
  case AArch64::LDRQroW:
  case AArch64::LDRQroX:
  case AArch64::LDRSBWroW:
  case AArch64::LDRSBWroX:
  case AArch64::LDRSBXroW:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSHWroW:
  case AArch64::LDRSHWroX:
  case AArch64::LDRSHXroW:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSWroW:
  case AArch64::LDRSWroX:
  case AArch64::LDRSroW:
  case AArch64::LDRSroX:
  case AArch64::LDRWroW:
  case AArch64::LDRWroX:
  case AArch64::LDRXroW:
  case AArch64::LDRXroX:
  case AArch64::PRFMroW:
  case AArch64::PRFMroX:
  case AArch64::STRBBroW:
  case AArch64::STRBBroX:
  case AArch64::STRBroW:
  case AArch64::STRBroX:
  case AArch64::STRDroW:
  case AArch64::STRDroX:
  case AArch64::STRHHroW:
  case AArch64::STRHHroX:
  case AArch64::STRHroW:
  case AArch64::STRHroX:
  case AArch64::STRQroW:
  case AArch64::STRQroX:
  case AArch64::STRSroW:
  case AArch64::STRSroX:
  case AArch64::STRWroW:
  case AArch64::STRWroX:
  case AArch64::STRXroW:
  case AArch64::STRXroX: {
    unsigned IsSigned = MI.getOperand(3).getImm();
    return !IsSigned;
  }
  }
}

bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
    default:
      return false;
    case AArch64::SEH_StackAlloc:
    case AArch64::SEH_SaveFPLR:
    case AArch64::SEH_SaveFPLR_X:
    case AArch64::SEH_SaveReg:
    case AArch64::SEH_SaveReg_X:
    case AArch64::SEH_SaveRegP:
    case AArch64::SEH_SaveRegP_X:
    case AArch64::SEH_SaveFReg:
    case AArch64::SEH_SaveFReg_X:
    case AArch64::SEH_SaveFRegP:
    case AArch64::SEH_SaveFRegP_X:
    case AArch64::SEH_SetFP:
    case AArch64::SEH_AddFP:
    case AArch64::SEH_Nop:
    case AArch64::SEH_PrologEnd:
    case AArch64::SEH_EpilogStart:
    case AArch64::SEH_EpilogEnd:
    case AArch64::SEH_PACSignLR:
    case AArch64::SEH_SaveAnyRegQP:
    case AArch64::SEH_SaveAnyRegQPX:
    case AArch64::SEH_AllocZ:
    case AArch64::SEH_SaveZReg:
    case AArch64::SEH_SavePReg:
      return true;
  }
}

bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
                                             Register &SrcReg, Register &DstReg,
                                             unsigned &SubIdx) const {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::SBFMXri: // aka sxtw
  case AArch64::UBFMXri: // aka uxtw
    // Check for the 32 -> 64 bit extension case, these instructions can do
    // much more.
    if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
      return false;
    // This is a signed or unsigned 32 -> 64 bit extension.
    SrcReg = MI.getOperand(1).getReg();
    DstReg = MI.getOperand(0).getReg();
    SubIdx = AArch64::sub_32;
    return true;
  }
}

bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
    const MachineInstr &MIa, const MachineInstr &MIb) const {
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
  int64_t OffsetA = 0, OffsetB = 0;
  TypeSize WidthA(0, false), WidthB(0, false);
  bool OffsetAIsScalable = false, OffsetBIsScalable = false;

  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");

  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
      MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    return false;

  // Retrieve the base, the offset from the base, and the width. The width is
  // the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If the
  // bases are identical, and the offset of the lower memory access plus its
  // width does not overlap the offset of the higher memory access, then the
  // memory accesses are disjoint.
  // If OffsetAIsScalable and OffsetBIsScalable are both true, they
  // are assumed to have the same scale (vscale).
  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
                                   WidthA, TRI) &&
      getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
                                   WidthB, TRI)) {
    if (BaseOpA->isIdenticalTo(*BaseOpB) &&
        OffsetAIsScalable == OffsetBIsScalable) {
      int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
      int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
      TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
      if (LowWidth.isScalable() == OffsetAIsScalable &&
          LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
        return true;
    }
  }
  return false;
}
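
// Worked example (illustrative sketch with hypothetical operands): for
//   ldr x1, [x0]        // base x0, offset 0, width 8
//   str x2, [x0, #8]    // base x0, offset 8, width 8
// the bases are identical and 0 + 8 <= 8, so the accesses are reported
// disjoint; with the store at [x0, #4] instead, 0 + 8 > 4 and the function
// conservatively returns false.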

bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                            const MachineBasicBlock *MBB,
                                            const MachineFunction &MF) const {
  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
    return true;

  // Do not move an instruction that can be recognized as a branch target.
  if (hasBTISemantics(MI))
    return true;

  switch (MI.getOpcode()) {
  case AArch64::HINT:
    // CSDB hints are scheduling barriers.
    if (MI.getOperand(0).getImm() == 0x14)
      return true;
    break;
  case AArch64::DSB:
  case AArch64::ISB:
    // DSB and ISB also are scheduling barriers.
    return true;
  case AArch64::MSRpstatesvcrImm1:
    // SMSTART and SMSTOP are also scheduling barriers.
    return true;
  default:;
  }
  if (isSEHInstruction(MI))
    return true;
  auto Next = std::next(MI.getIterator());
  return Next != MBB->end() && Next->isCFIInstruction();
}

/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
                                      Register &SrcReg2, int64_t &CmpMask,
                                      int64_t &CmpValue) const {
  // The first operand can be a frame index where we'd normally expect a
  // register.
  // FIXME: Pass subregisters out of analyzeCompare
  assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
  if (!MI.getOperand(1).isReg() || MI.getOperand(1).getSubReg())
    return false;

  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::PTEST_PP:
  case AArch64::PTEST_PP_ANY:
    SrcReg = MI.getOperand(0).getReg();
    SrcReg2 = MI.getOperand(1).getReg();
    if (MI.getOperand(2).getSubReg())
      return false;

    // Not sure about the mask and value for now...
    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWrr:
  case AArch64::SUBSWrs:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXrs:
  case AArch64::SUBSXrx:
  case AArch64::ADDSWrr:
  case AArch64::ADDSWrs:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXrs:
  case AArch64::ADDSXrx:
    // Replace SUBSWrr with SUBWrr if NZCV is not used.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = MI.getOperand(2).getReg();

    // FIXME: Pass subregisters out of analyzeCompare
    if (MI.getOperand(2).getSubReg())
      return false;

    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWri:
  case AArch64::ADDSWri:
  case AArch64::SUBSXri:
  case AArch64::ADDSXri:
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    CmpValue = MI.getOperand(2).getImm();
    return true;
  case AArch64::ANDSWri:
  case AArch64::ANDSXri:
    // ANDS does not use the same encoding scheme as the other xxxS
    // instructions.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    CmpValue = AArch64_AM::decodeLogicalImmediate(
                   MI.getOperand(2).getImm(),
                   MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
    return true;
  }

  return false;
}

static bool UpdateOperandRegClass(MachineInstr &Instr) {
  MachineBasicBlock *MBB = Instr.getParent();
  assert(MBB && "Can't get MachineBasicBlock here");
  MachineFunction *MF = MBB->getParent();
  assert(MF && "Can't get MachineFunction here");
  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  MachineRegisterInfo *MRI = &MF->getRegInfo();

  for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
       ++OpIdx) {
    MachineOperand &MO = Instr.getOperand(OpIdx);
    const TargetRegisterClass *OpRegCstraints =
        Instr.getRegClassConstraint(OpIdx, TII, TRI);

    // If there's no constraint, there's nothing to do.
    if (!OpRegCstraints)
      continue;
    // If the operand is a frame index, there's nothing to do here.
    // A frame index operand will resolve correctly during PEI.
    if (MO.isFI())
      continue;

    assert(MO.isReg() &&
           "Operand has register constraints without being a register!");

    Register Reg = MO.getReg();
    if (Reg.isPhysical()) {
      if (!OpRegCstraints->contains(Reg))
        return false;
    } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
               !MRI->constrainRegClass(Reg, OpRegCstraints))
      return false;
  }

  return true;
}

/// Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible for doing the actual
/// substitution and legality checking.
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
  // Don't convert all compare instructions, because for some the zero register
  // encoding becomes the sp register.
  bool MIDefinesZeroReg = false;
  if (MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
      MI.definesRegister(AArch64::XZR, /*TRI=*/nullptr))
    MIDefinesZeroReg = true;

  switch (MI.getOpcode()) {
  default:
    return MI.getOpcode();
  case AArch64::ADDSWrr:
    return AArch64::ADDWrr;
  case AArch64::ADDSWri:
    return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
  case AArch64::ADDSWrs:
    return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
  case AArch64::ADDSWrx:
    return AArch64::ADDWrx;
  case AArch64::ADDSXrr:
    return AArch64::ADDXrr;
  case AArch64::ADDSXri:
    return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
  case AArch64::ADDSXrs:
    return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
  case AArch64::ADDSXrx:
    return AArch64::ADDXrx;
  case AArch64::SUBSWrr:
    return AArch64::SUBWrr;
  case AArch64::SUBSWri:
    return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
  case AArch64::SUBSWrs:
    return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
  case AArch64::SUBSWrx:
    return AArch64::SUBWrx;
  case AArch64::SUBSXrr:
    return AArch64::SUBXrr;
  case AArch64::SUBSXri:
    return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
  case AArch64::SUBSXrs:
    return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
  case AArch64::SUBSXrx:
    return AArch64::SUBXrx;
  }
}
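
// Example of the zero-register caveat (illustrative sketch): a compare such as
//   SUBSWri wzr, w1, #4     // cmp w1, #4
// must keep its flag-setting form, because in the non-flag-setting SUBWri
// encoding the register slot that meant WZR is reinterpreted as WSP; the
// MIDefinesZeroReg checks above preserve the S-form in exactly those cases.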

enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };

/// True when condition flags are accessed (either by writing or reading)
/// on the instruction trace starting at From and ending at To.
///
/// Note: If From and To are from different blocks, it's assumed the condition
///       flags are accessed on the path.
static bool areCFlagsAccessedBetweenInstrs(
    MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
    const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
  // Early exit if To is at the beginning of the BB.
  if (To == To->getParent()->begin())
    return true;

  // Check whether the instructions are in the same basic block
  // If not, assume the condition flags might get modified somewhere.
  if (To->getParent() != From->getParent())
    return true;

  // From must be above To.
  assert(std::any_of(
      ++To.getReverse(), To->getParent()->rend(),
      [From](MachineInstr &MI) { return MI.getIterator() == From; }));

  // We iterate backward starting at \p To until we hit \p From.
  for (const MachineInstr &Instr :
       instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
    if (((AccessToCheck & AK_Write) &&
         Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
        ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
      return true;
  }
  return false;
}
1472 
1473 std::optional<unsigned>
1474 AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
1475                                       MachineInstr *Pred,
1476                                       const MachineRegisterInfo *MRI) const {
1477   unsigned MaskOpcode = Mask->getOpcode();
1478   unsigned PredOpcode = Pred->getOpcode();
1479   bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
1480   bool PredIsWhileLike = isWhileOpcode(PredOpcode);
1481 
1482   if (PredIsWhileLike) {
1483     // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
1484     // instruction and the condition is "any" since WHILEcc does an implicit
1485     // PTEST(ALL, PG) check and PG is always a subset of ALL.
1486     if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1487       return PredOpcode;
1488 
1489     // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
1490     // redundant since WHILE performs an implicit PTEST with an all active
1491     // mask.
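    // E.g. (an illustrative sketch):
    //   whilelo p0.s, x0, x1  ; implicit PTEST(ptrue_all.s, p0.s)
    //   ptrue   p1.s, all
    //   ptest   p1, p0.b      ; redundant when element sizes match, as here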
1492     if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1493         getElementSizeForOpcode(MaskOpcode) ==
1494             getElementSizeForOpcode(PredOpcode))
1495       return PredOpcode;
1496 
1497     return {};
1498   }
1499 
1500   if (PredIsPTestLike) {
1501     // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1502     // instruction that sets the flags as PTEST would and the condition is
1503     // "any" since PG is always a subset of the governing predicate of the
1504     // ptest-like instruction.
1505     if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1506       return PredOpcode;
1507 
1508     auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1509 
1510     // If the PTEST-like instruction's governing predicate is not `Mask`, attempt
1511     // to look through a copy and try again. This is because some instructions
1512     // take a predicate whose register class is a subset of its result class.
1513     if (Mask != PTestLikeMask && PTestLikeMask->isFullCopy() &&
1514         PTestLikeMask->getOperand(1).getReg().isVirtual())
1515       PTestLikeMask =
1516           MRI->getUniqueVRegDef(PTestLikeMask->getOperand(1).getReg());
1517 
1518     // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
1519     // element size matches and either the PTEST_LIKE instruction uses
1520     // the same all active mask or the condition is "any".
1521     if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1522         getElementSizeForOpcode(MaskOpcode) ==
1523             getElementSizeForOpcode(PredOpcode)) {
1524       if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1525         return PredOpcode;
1526     }
1527 
1528     // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
1529     // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
1530     // on 8-bit predicates like the PTEST.  Otherwise, for instructions like
1531     // compare that also support 16/32/64-bit predicates, the implicit PTEST
1532     // performed by the compare could consider fewer lanes for these element
1533     // sizes.
1534     //
1535     // For example, consider
1536     //
1537     //   ptrue p0.b                    ; P0=1111-1111-1111-1111
1538     //   index z0.s, #0, #1            ; Z0=<0,1,2,3>
1539     //   index z1.s, #1, #1            ; Z1=<1,2,3,4>
1540     //   cmphi p1.s, p0/z, z1.s, z0.s  ; P1=0001-0001-0001-0001
1541     //                                 ;       ^ last active
1542     //   ptest p0, p1.b                ; P1=0001-0001-0001-0001
1543     //                                 ;     ^ last active
1544     //
1545     // where the compare generates a canonical all active 32-bit predicate
1546     // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
1547     // active flag, whereas the PTEST instruction with the same mask doesn't.
1548     // For PTEST_ANY this doesn't apply as the flags in this case would be
1549     // identical regardless of element size.
1550     uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
1551     if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
1552                                   PTest->getOpcode() == AArch64::PTEST_PP_ANY))
1553       return PredOpcode;
1554 
1555     return {};
1556   }
1557 
1558   // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant, change the
1559   // opcode so the PTEST becomes redundant.
1560   switch (PredOpcode) {
1561   case AArch64::AND_PPzPP:
1562   case AArch64::BIC_PPzPP:
1563   case AArch64::EOR_PPzPP:
1564   case AArch64::NAND_PPzPP:
1565   case AArch64::NOR_PPzPP:
1566   case AArch64::ORN_PPzPP:
1567   case AArch64::ORR_PPzPP:
1568   case AArch64::BRKA_PPzP:
1569   case AArch64::BRKPA_PPzPP:
1570   case AArch64::BRKB_PPzP:
1571   case AArch64::BRKPB_PPzPP:
1572   case AArch64::RDFFR_PPz: {
1573     // Check to see if our mask is the same. If not, the resulting flag bits
1574     // may be different and we can't remove the ptest.
1575     auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1576     if (Mask != PredMask)
1577       return {};
1578     break;
1579   }
1580   case AArch64::BRKN_PPzP: {
1581     // BRKN uses an all active implicit mask to set flags unlike the other
1582     // flag-setting instructions.
1583     // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
1584     if ((MaskOpcode != AArch64::PTRUE_B) ||
1585         (Mask->getOperand(1).getImm() != 31))
1586       return {};
1587     break;
1588   }
1589   case AArch64::PTRUE_B:
1590     // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
1591     break;
1592   default:
1593     // Bail out if we don't recognize the input
1594     return {};
1595   }
1596 
1597   return convertToFlagSettingOpc(PredOpcode);
1598 }
1599 
1600 /// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1601 /// operation that could set the flags in an identical manner.
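/// E.g. a PTEST of an AND_PPzPP result that uses the same governing predicate
/// can be removed by rewriting the AND as ANDS_PPzPP (see canRemovePTestInstr).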
1602 bool AArch64InstrInfo::optimizePTestInstr(
1603     MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1604     const MachineRegisterInfo *MRI) const {
1605   auto *Mask = MRI->getUniqueVRegDef(MaskReg);
1606   auto *Pred = MRI->getUniqueVRegDef(PredReg);
1607   unsigned PredOpcode = Pred->getOpcode();
1608   auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
1609   if (!NewOp)
1610     return false;
1611 
1612   const TargetRegisterInfo *TRI = &getRegisterInfo();
1613 
1614   // If another instruction between Pred and PTest accesses flags, don't remove
1615   // the ptest or update the earlier instruction to modify them.
1616   if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
1617     return false;
1618 
1619   // If we pass all the checks, it's safe to remove the PTEST and use the flags
1620   // as they are prior to PTEST. Sometimes this requires the tested PTEST
1621   // operand to be replaced with an equivalent instruction that also sets the
1622   // flags.
1623   PTest->eraseFromParent();
1624   if (*NewOp != PredOpcode) {
1625     Pred->setDesc(get(*NewOp));
1626     bool succeeded = UpdateOperandRegClass(*Pred);
1627     (void)succeeded;
1628     assert(succeeded && "Operands have incompatible register classes!");
1629     Pred->addRegisterDefined(AArch64::NZCV, TRI);
1630   }
1631 
1632   // Ensure that the flags def is live.
1633   if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1634     unsigned i = 0, e = Pred->getNumOperands();
1635     for (; i != e; ++i) {
1636       MachineOperand &MO = Pred->getOperand(i);
1637       if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1638         MO.setIsDead(false);
1639         break;
1640       }
1641     }
1642   }
1643   return true;
1644 }
1645 
1646 /// Try to optimize a compare instruction. A compare instruction is an
1647 /// instruction which produces AArch64::NZCV. It is a true compare
1648 /// instruction
1649 /// only when there are no uses of its destination register.
1650 ///
1651 /// The following steps are tried in order:
1652 /// 1. Convert CmpInstr into an unconditional version.
1653 /// 2. Remove CmpInstr if there is an earlier instruction producing a needed
1654 ///    condition code, or an instruction that can be converted into such an
1655 ///    instruction.
1656 ///    Only comparison with zero is supported.
1657 bool AArch64InstrInfo::optimizeCompareInstr(
1658     MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
1659     int64_t CmpValue, const MachineRegisterInfo *MRI) const {
1660   assert(CmpInstr.getParent());
1661   assert(MRI);
1662 
1663   // Replace SUBSWrr with SUBWrr if NZCV is not used.
1664   int DeadNZCVIdx =
1665       CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
1666   if (DeadNZCVIdx != -1) {
1667     if (CmpInstr.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1668         CmpInstr.definesRegister(AArch64::XZR, /*TRI=*/nullptr)) {
1669       CmpInstr.eraseFromParent();
1670       return true;
1671     }
1672     unsigned Opc = CmpInstr.getOpcode();
1673     unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1674     if (NewOpc == Opc)
1675       return false;
1676     const MCInstrDesc &MCID = get(NewOpc);
1677     CmpInstr.setDesc(MCID);
1678     CmpInstr.removeOperand(DeadNZCVIdx);
1679     bool succeeded = UpdateOperandRegClass(CmpInstr);
1680     (void)succeeded;
1681     assert(succeeded && "Some operand register classes are incompatible!");
1682     return true;
1683   }
1684 
1685   if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
1686       CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY)
1687     return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
1688 
1689   if (SrcReg2 != 0)
1690     return false;
1691 
1692   // CmpInstr is a true compare instruction if its destination register is not used.
1693   if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1694     return false;
1695 
1696   if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
1697     return true;
1698   return (CmpValue == 0 || CmpValue == 1) &&
1699          removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
1700 }
1701 
1702 /// Get opcode of S version of Instr.
1703 /// If Instr is already the S version, its opcode is returned.
1704 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
1705 /// version or we are not interested in it.
1706 static unsigned sForm(MachineInstr &Instr) {
1707   switch (Instr.getOpcode()) {
1708   default:
1709     return AArch64::INSTRUCTION_LIST_END;
1710 
1711   case AArch64::ADDSWrr:
1712   case AArch64::ADDSWri:
1713   case AArch64::ADDSXrr:
1714   case AArch64::ADDSXri:
1715   case AArch64::SUBSWrr:
1716   case AArch64::SUBSWri:
1717   case AArch64::SUBSXrr:
1718   case AArch64::SUBSXri:
1719     return Instr.getOpcode();
1720 
1721   case AArch64::ADDWrr:
1722     return AArch64::ADDSWrr;
1723   case AArch64::ADDWri:
1724     return AArch64::ADDSWri;
1725   case AArch64::ADDXrr:
1726     return AArch64::ADDSXrr;
1727   case AArch64::ADDXri:
1728     return AArch64::ADDSXri;
1729   case AArch64::ADCWr:
1730     return AArch64::ADCSWr;
1731   case AArch64::ADCXr:
1732     return AArch64::ADCSXr;
1733   case AArch64::SUBWrr:
1734     return AArch64::SUBSWrr;
1735   case AArch64::SUBWri:
1736     return AArch64::SUBSWri;
1737   case AArch64::SUBXrr:
1738     return AArch64::SUBSXrr;
1739   case AArch64::SUBXri:
1740     return AArch64::SUBSXri;
1741   case AArch64::SBCWr:
1742     return AArch64::SBCSWr;
1743   case AArch64::SBCXr:
1744     return AArch64::SBCSXr;
1745   case AArch64::ANDWri:
1746     return AArch64::ANDSWri;
1747   case AArch64::ANDXri:
1748     return AArch64::ANDSXri;
1749   }
1750 }
1751 
1752 /// Check if AArch64::NZCV should be alive in successors of MBB.
1753 static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
1754   for (auto *BB : MBB->successors())
1755     if (BB->isLiveIn(AArch64::NZCV))
1756       return true;
1757   return false;
1758 }
1759 
1760 /// \returns The condition code operand index for \p Instr if it is a branch
1761 /// or select and -1 otherwise.
1762 static int
1763 findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
1764   switch (Instr.getOpcode()) {
1765   default:
1766     return -1;
1767 
1768   case AArch64::Bcc: {
1769     int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
1770     assert(Idx >= 2);
1771     return Idx - 2;
1772   }
1773 
1774   case AArch64::CSINVWr:
1775   case AArch64::CSINVXr:
1776   case AArch64::CSINCWr:
1777   case AArch64::CSINCXr:
1778   case AArch64::CSELWr:
1779   case AArch64::CSELXr:
1780   case AArch64::CSNEGWr:
1781   case AArch64::CSNEGXr:
1782   case AArch64::FCSELSrrr:
1783   case AArch64::FCSELDrrr: {
1784     int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
1785     assert(Idx >= 1);
1786     return Idx - 1;
1787   }
1788   }
1789 }
1790 
1791 /// Find a condition code used by the instruction.
1792 /// Returns AArch64CC::Invalid if either the instruction does not use condition
1793 /// codes or we don't optimize CmpInstr in the presence of such instructions.
1794 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1795   int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
1796   return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
1797                           Instr.getOperand(CCIdx).getImm())
1798                     : AArch64CC::Invalid;
1799 }
1800 
1801 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1802   assert(CC != AArch64CC::Invalid);
1803   UsedNZCV UsedFlags;
1804   switch (CC) {
1805   default:
1806     break;
1807 
1808   case AArch64CC::EQ: // Z set
1809   case AArch64CC::NE: // Z clear
1810     UsedFlags.Z = true;
1811     break;
1812 
1813   case AArch64CC::HI: // Z clear and C set
1814   case AArch64CC::LS: // Z set   or  C clear
1815     UsedFlags.Z = true;
1816     [[fallthrough]];
1817   case AArch64CC::HS: // C set
1818   case AArch64CC::LO: // C clear
1819     UsedFlags.C = true;
1820     break;
1821 
1822   case AArch64CC::MI: // N set
1823   case AArch64CC::PL: // N clear
1824     UsedFlags.N = true;
1825     break;
1826 
1827   case AArch64CC::VS: // V set
1828   case AArch64CC::VC: // V clear
1829     UsedFlags.V = true;
1830     break;
1831 
1832   case AArch64CC::GT: // Z clear, N and V the same
1833   case AArch64CC::LE: // Z set,   N and V differ
1834     UsedFlags.Z = true;
1835     [[fallthrough]];
1836   case AArch64CC::GE: // N and V the same
1837   case AArch64CC::LT: // N and V differ
1838     UsedFlags.N = true;
1839     UsedFlags.V = true;
1840     break;
1841   }
1842   return UsedFlags;
1843 }
1844 
1845 /// \returns Condition flags used after \p CmpInstr in its MachineBB if the
1846 /// NZCV flags are not alive in the successors of the block containing both
1847 /// \p CmpInstr and \p MI. \returns std::nullopt otherwise.
1848 ///
1849 /// Collects instructions using those flags in \p CCUseInstrs if provided.
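/// For instance (illustrative), after
/// \code
///   subs w8, w0, #0    ; CmpInstr
///   b.eq ...           ; reads Z
/// \endcode
/// the result has only Z set, and the Bcc is added to \p CCUseInstrs.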
1850 std::optional<UsedNZCV>
1851 llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
1852                        const TargetRegisterInfo &TRI,
1853                        SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
1854   MachineBasicBlock *CmpParent = CmpInstr.getParent();
1855   if (MI.getParent() != CmpParent)
1856     return std::nullopt;
1857 
1858   if (areCFlagsAliveInSuccessors(CmpParent))
1859     return std::nullopt;
1860 
1861   UsedNZCV NZCVUsedAfterCmp;
1862   for (MachineInstr &Instr : instructionsWithoutDebug(
1863            std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
1864     if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
1865       AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1866       if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1867         return std::nullopt;
1868       NZCVUsedAfterCmp |= getUsedNZCV(CC);
1869       if (CCUseInstrs)
1870         CCUseInstrs->push_back(&Instr);
1871     }
1872     if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
1873       break;
1874   }
1875   return NZCVUsedAfterCmp;
1876 }
1877 
1878 static bool isADDSRegImm(unsigned Opcode) {
1879   return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1880 }
1881 
1882 static bool isSUBSRegImm(unsigned Opcode) {
1883   return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1884 }
1885 
1886 /// Check if CmpInstr can be substituted by MI.
1887 ///
1888 /// CmpInstr can be substituted when:
1889 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1890 /// - and, MI and CmpInstr are in the same MachineBB
1891 /// - and, condition flags are not alive in successors of the CmpInstr parent
1892 /// - and, if the MI opcode is the S form there are no defs of flags between
1893 ///        MI and CmpInstr,
1894 ///        or if the MI opcode is not the S form there are neither defs nor
1895 ///        uses of flags between MI and CmpInstr
1896 /// - and, the C flag is not used after CmpInstr,
1897 ///        and the V flag is either unused after CmpInstr or MI produces a
1898 ///        poison value if signed overflow occurs (no-signed-wrap).
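/// A hypothetical before/after (registers illustrative):
/// \code
///   sub  w8, w9, #5    ; MI
///   cmp  w8, #0        ; CmpInstr (SUBSWri)
///   b.eq ...
/// \endcode
/// becomes
/// \code
///   subs w8, w9, #5    ; MI converted to its S form; cmp removed
///   b.eq ...
/// \endcode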
1899 static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
1900                                        const TargetRegisterInfo &TRI) {
1901   // NOTE this assertion guarantees that MI.getOpcode() is an add or subtract
1902   // that may or may not set flags.
1903   assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
1904 
1905   const unsigned CmpOpcode = CmpInstr.getOpcode();
1906   if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
1907     return false;
1908 
1909   assert((CmpInstr.getOperand(2).isImm() &&
1910           CmpInstr.getOperand(2).getImm() == 0) &&
1911          "Caller guarantees that CmpInstr compares with constant 0");
1912 
1913   std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(MI, CmpInstr, TRI);
1914   if (!NZCVUsed || NZCVUsed->C)
1915     return false;
1916 
1917   // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
1918   // '%vreg = add ...' or '%vreg = sub ...'.
1919   // Condition flag V is used to indicate signed overflow.
1920   // 1) MI and CmpInstr set N and V to the same value.
1921   // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
1922   //    signed overflow occurs, so CmpInstr could still be simplified away.
1923   if (NZCVUsed->V && !MI.getFlag(MachineInstr::NoSWrap))
1924     return false;
1925 
1926   AccessKind AccessToCheck = AK_Write;
1927   if (sForm(MI) != MI.getOpcode())
1928     AccessToCheck = AK_All;
1929   return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
1930 }
1931 
1932 /// Substitute an instruction comparing to zero with another instruction
1933 /// which produces needed condition flags.
1934 ///
1935 /// Return true on success.
1936 bool AArch64InstrInfo::substituteCmpToZero(
1937     MachineInstr &CmpInstr, unsigned SrcReg,
1938     const MachineRegisterInfo &MRI) const {
1939   // Get the unique definition of SrcReg.
1940   MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1941   if (!MI)
1942     return false;
1943 
1944   const TargetRegisterInfo &TRI = getRegisterInfo();
1945 
1946   unsigned NewOpc = sForm(*MI);
1947   if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1948     return false;
1949 
1950   if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
1951     return false;
1952 
1953   // Update the instruction to set NZCV.
1954   MI->setDesc(get(NewOpc));
1955   CmpInstr.eraseFromParent();
1956   bool succeeded = UpdateOperandRegClass(*MI);
1957   (void)succeeded;
1958   assert(succeeded && "Some operand register classes are incompatible!");
1959   MI->addRegisterDefined(AArch64::NZCV, &TRI);
1960   return true;
1961 }
1962 
1963 /// \returns True if \p CmpInstr can be removed.
1964 ///
1965 /// \p IsInvertCC is true if, after removing \p CmpInstr, condition
1966 /// codes used in \p CCUseInstrs must be inverted.
1967 static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
1968                                  int CmpValue, const TargetRegisterInfo &TRI,
1969                                  SmallVectorImpl<MachineInstr *> &CCUseInstrs,
1970                                  bool &IsInvertCC) {
1971   assert((CmpValue == 0 || CmpValue == 1) &&
1972          "Only comparisons to 0 or 1 considered for removal!");
1973 
1974   // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
1975   unsigned MIOpc = MI.getOpcode();
1976   if (MIOpc == AArch64::CSINCWr) {
1977     if (MI.getOperand(1).getReg() != AArch64::WZR ||
1978         MI.getOperand(2).getReg() != AArch64::WZR)
1979       return false;
1980   } else if (MIOpc == AArch64::CSINCXr) {
1981     if (MI.getOperand(1).getReg() != AArch64::XZR ||
1982         MI.getOperand(2).getReg() != AArch64::XZR)
1983       return false;
1984   } else {
1985     return false;
1986   }
1987   AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI);
1988   if (MICC == AArch64CC::Invalid)
1989     return false;
1990 
1991   // Bail if MI has a dead def of NZCV.
1992   if (MI.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) != -1)
1993     return false;
1994 
1995   // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
1996   const unsigned CmpOpcode = CmpInstr.getOpcode();
1997   bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
1998   if (CmpValue && !IsSubsRegImm)
1999     return false;
2000   if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
2001     return false;
2002 
2003   // MI conditions allowed: eq, ne, mi, pl
2004   UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
2005   if (MIUsedNZCV.C || MIUsedNZCV.V)
2006     return false;
2007 
2008   std::optional<UsedNZCV> NZCVUsedAfterCmp =
2009       examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
2010   // Condition flags are not used in CmpInstr basic block successors, and only
2011   // the Z or N flags are allowed to be used after CmpInstr within its block.
2012   if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
2013     return false;
2014   // Z or N flag used after CmpInstr must correspond to the flag used in MI
2015   if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
2016       (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
2017     return false;
2018   // If CmpInstr is comparison to zero MI conditions are limited to eq, ne
2019   if (MIUsedNZCV.N && !CmpValue)
2020     return false;
2021 
2022   // There must be no defs of flags between MI and CmpInstr
2023   if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
2024     return false;
2025 
2026   // Condition code is inverted in the following cases:
2027   // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2028   // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
2029   IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
2030                (!CmpValue && MICC == AArch64CC::NE);
2031   return true;
2032 }
2033 
2034 /// Remove comparison in csinc-cmp sequence
2035 ///
2036 /// Examples:
2037 /// 1. \code
2038 ///   csinc w9, wzr, wzr, ne
2039 ///   cmp   w9, #0
2040 ///   b.eq
2041 ///    \endcode
2042 /// to
2043 ///    \code
2044 ///   csinc w9, wzr, wzr, ne
2045 ///   b.ne
2046 ///    \endcode
2047 ///
2048 /// 2. \code
2049 ///   csinc x2, xzr, xzr, mi
2050 ///   cmp   x2, #1
2051 ///   b.pl
2052 ///    \endcode
2053 /// to
2054 ///    \code
2055 ///   csinc x2, xzr, xzr, mi
2056 ///   b.pl
2057 ///    \endcode
2058 ///
2059 /// \param  CmpInstr comparison instruction
2060 /// \return True when comparison removed
2061 bool AArch64InstrInfo::removeCmpToZeroOrOne(
2062     MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
2063     const MachineRegisterInfo &MRI) const {
2064   MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
2065   if (!MI)
2066     return false;
2067   const TargetRegisterInfo &TRI = getRegisterInfo();
2068   SmallVector<MachineInstr *, 4> CCUseInstrs;
2069   bool IsInvertCC = false;
2070   if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
2071                             IsInvertCC))
2072     return false;
2073   // Make transformation
2074   CmpInstr.eraseFromParent();
2075   if (IsInvertCC) {
2076     // Invert condition codes in CmpInstr CC users
2077     for (MachineInstr *CCUseInstr : CCUseInstrs) {
2078       int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
2079       assert(Idx >= 0 && "Unexpected instruction using CC.");
2080       MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
2081       AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
2082           static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
2083       CCOperand.setImm(CCUse);
2084     }
2085   }
2086   return true;
2087 }
2088 
2089 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2090   if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
2091       MI.getOpcode() != AArch64::CATCHRET)
2092     return false;
2093 
2094   MachineBasicBlock &MBB = *MI.getParent();
2095   auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
2096   auto TRI = Subtarget.getRegisterInfo();
2097   DebugLoc DL = MI.getDebugLoc();
2098 
2099   if (MI.getOpcode() == AArch64::CATCHRET) {
2100     // Skip to the first instruction before the epilog.
2101     const TargetInstrInfo *TII =
2102       MBB.getParent()->getSubtarget().getInstrInfo();
2103     MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
2104     auto MBBI = MachineBasicBlock::iterator(MI);
2105     MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
2106     while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
2107            FirstEpilogSEH != MBB.begin())
2108       FirstEpilogSEH = std::prev(FirstEpilogSEH);
2109     if (FirstEpilogSEH != MBB.begin())
2110       FirstEpilogSEH = std::next(FirstEpilogSEH);
2111     BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
2112         .addReg(AArch64::X0, RegState::Define)
2113         .addMBB(TargetMBB);
2114     BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
2115         .addReg(AArch64::X0, RegState::Define)
2116         .addReg(AArch64::X0)
2117         .addMBB(TargetMBB)
2118         .addImm(0);
2119     TargetMBB->setMachineBlockAddressTaken();
2120     return true;
2121   }
2122 
2123   Register Reg = MI.getOperand(0).getReg();
2124   Module &M = *MBB.getParent()->getFunction().getParent();
2125   if (M.getStackProtectorGuard() == "sysreg") {
2126     const AArch64SysReg::SysReg *SrcReg =
2127         AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
2128     if (!SrcReg)
2129       report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
2130 
2131     // mrs xN, sysreg
2132     BuildMI(MBB, MI, DL, get(AArch64::MRS))
2133         .addDef(Reg, RegState::Renamable)
2134         .addImm(SrcReg->Encoding);
2135     int Offset = M.getStackProtectorGuardOffset();
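    // The offset is then encoded roughly as follows (values illustrative):
    //   #1032 -> ldr  x0, [x0, #1032]             (scaled unsigned 12-bit)
    //   #-40  -> ldur x0, [x0, #-40]              (unscaled signed 9-bit)
    //   #4100 -> add x0, x0, #4100 ; ldr x0, [x0] (12-bit add/sub, then load)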
2136     if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
2137       // ldr xN, [xN, #offset]
2138       BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2139           .addDef(Reg)
2140           .addUse(Reg, RegState::Kill)
2141           .addImm(Offset / 8);
2142     } else if (Offset >= -256 && Offset <= 255) {
2143       // ldur xN, [xN, #offset]
2144       BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
2145           .addDef(Reg)
2146           .addUse(Reg, RegState::Kill)
2147           .addImm(Offset);
2148     } else if (Offset >= -4095 && Offset <= 4095) {
2149       if (Offset > 0) {
2150         // add xN, xN, #offset
2151         BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
2152             .addDef(Reg)
2153             .addUse(Reg, RegState::Kill)
2154             .addImm(Offset)
2155             .addImm(0);
2156       } else {
2157         // sub xN, xN, #offset
2158         BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
2159             .addDef(Reg)
2160             .addUse(Reg, RegState::Kill)
2161             .addImm(-Offset)
2162             .addImm(0);
2163       }
2164       // ldr xN, [xN]
2165       BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2166           .addDef(Reg)
2167           .addUse(Reg, RegState::Kill)
2168           .addImm(0);
2169     } else {
2170       // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
2171     // than 32760.
2172       // It might be nice to use AArch64::MOVi32imm here, which would get
2173       // expanded in PreSched2 after PostRA, but our lone scratch Reg already
2174       // contains the MRS result. findScratchNonCalleeSaveRegister() in
2175       // AArch64FrameLowering might help us find such a scratch register
2176       // though. If we failed to find a scratch register, we could emit a
2177       // stream of add instructions to build up the immediate. Or, we could try
2178       // to insert a AArch64::MOVi32imm before register allocation so that we
2179       // didn't need to scavenge for a scratch register.
2180       report_fatal_error("Unable to encode Stack Protector Guard Offset");
2181     }
2182     MBB.erase(MI);
2183     return true;
2184   }
2185 
2186   const GlobalValue *GV =
2187       cast<GlobalValue>((*MI.memoperands_begin())->getValue());
2188   const TargetMachine &TM = MBB.getParent()->getTarget();
2189   unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2190   const unsigned char MO_NC = AArch64II::MO_NC;
2191 
2192   if ((OpFlags & AArch64II::MO_GOT) != 0) {
2193     BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
2194         .addGlobalAddress(GV, 0, OpFlags);
2195     if (Subtarget.isTargetILP32()) {
2196       unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2197       BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2198           .addDef(Reg32, RegState::Dead)
2199           .addUse(Reg, RegState::Kill)
2200           .addImm(0)
2201           .addMemOperand(*MI.memoperands_begin())
2202           .addDef(Reg, RegState::Implicit);
2203     } else {
2204       BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2205           .addReg(Reg, RegState::Kill)
2206           .addImm(0)
2207           .addMemOperand(*MI.memoperands_begin());
2208     }
2209   } else if (TM.getCodeModel() == CodeModel::Large) {
2210     assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
2211     BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
2212         .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
2213         .addImm(0);
2214     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2215         .addReg(Reg, RegState::Kill)
2216         .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
2217         .addImm(16);
2218     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2219         .addReg(Reg, RegState::Kill)
2220         .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
2221         .addImm(32);
2222     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2223         .addReg(Reg, RegState::Kill)
2224         .addGlobalAddress(GV, 0, AArch64II::MO_G3)
2225         .addImm(48);
2226     BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2227         .addReg(Reg, RegState::Kill)
2228         .addImm(0)
2229         .addMemOperand(*MI.memoperands_begin());
2230   } else if (TM.getCodeModel() == CodeModel::Tiny) {
2231     BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
2232         .addGlobalAddress(GV, 0, OpFlags);
2233   } else {
2234     BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
2235         .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2236     unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2237     if (Subtarget.isTargetILP32()) {
2238       unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2239       BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2240           .addDef(Reg32, RegState::Dead)
2241           .addUse(Reg, RegState::Kill)
2242           .addGlobalAddress(GV, 0, LoFlags)
2243           .addMemOperand(*MI.memoperands_begin())
2244           .addDef(Reg, RegState::Implicit);
2245     } else {
2246       BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2247           .addReg(Reg, RegState::Kill)
2248           .addGlobalAddress(GV, 0, LoFlags)
2249           .addMemOperand(*MI.memoperands_begin());
2250     }
2251   }
2252 
2253   MBB.erase(MI);
2254 
2255   return true;
2256 }
2257 
2258 // Return true if this instruction simply sets its single destination register
2259 // to zero. This is equivalent to a register rename of the zero-register.
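// Matched forms include, for example: 'movz w0, #0', 'and w0, wzr, #imm',
// and a COPY from WZR (see the cases below).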
2260 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
2261   switch (MI.getOpcode()) {
2262   default:
2263     break;
2264   case AArch64::MOVZWi:
2265   case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2266     if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2267       assert(MI.getDesc().getNumOperands() == 3 &&
2268              MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2269       return true;
2270     }
2271     break;
2272   case AArch64::ANDWri: // and Rd, Rzr, #imm
2273     return MI.getOperand(1).getReg() == AArch64::WZR;
2274   case AArch64::ANDXri:
2275     return MI.getOperand(1).getReg() == AArch64::XZR;
2276   case TargetOpcode::COPY:
2277     return MI.getOperand(1).getReg() == AArch64::WZR;
2278   }
2279   return false;
2280 }
2281 
2282 // Return true if this instruction simply renames a general register without
2283 // modifying bits.
2284 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
2285   switch (MI.getOpcode()) {
2286   default:
2287     break;
2288   case TargetOpcode::COPY: {
2289     // GPR32 copies will be lowered to ORRXrs
2290     Register DstReg = MI.getOperand(0).getReg();
2291     return (AArch64::GPR32RegClass.contains(DstReg) ||
2292             AArch64::GPR64RegClass.contains(DstReg));
2293   }
2294   case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2295     if (MI.getOperand(1).getReg() == AArch64::XZR) {
2296       assert(MI.getDesc().getNumOperands() == 4 &&
2297              MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2298       return true;
2299     }
2300     break;
2301   case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2302     if (MI.getOperand(2).getImm() == 0) {
2303       assert(MI.getDesc().getNumOperands() == 4 &&
2304              MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2305       return true;
2306     }
2307     break;
2308   }
2309   return false;
2310 }
2311 
2312 // Return true if this instruction simply renames a general register without
2313 // modifying bits.
2314 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
2315   switch (MI.getOpcode()) {
2316   default:
2317     break;
2318   case TargetOpcode::COPY: {
2319     Register DstReg = MI.getOperand(0).getReg();
2320     return AArch64::FPR128RegClass.contains(DstReg);
2321   }
2322   case AArch64::ORRv16i8:
2323     if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2324       assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2325              "invalid ORRv16i8 operands");
2326       return true;
2327     }
2328     break;
2329   }
2330   return false;
2331 }
2332 
2333 Register AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
2334                                                int &FrameIndex) const {
2335   switch (MI.getOpcode()) {
2336   default:
2337     break;
2338   case AArch64::LDRWui:
2339   case AArch64::LDRXui:
2340   case AArch64::LDRBui:
2341   case AArch64::LDRHui:
2342   case AArch64::LDRSui:
2343   case AArch64::LDRDui:
2344   case AArch64::LDRQui:
2345   case AArch64::LDR_PXI:
2346     if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2347         MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2348       FrameIndex = MI.getOperand(1).getIndex();
2349       return MI.getOperand(0).getReg();
2350     }
2351     break;
2352   }
2353 
2354   return 0;
2355 }
2356 
2357 Register AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
2358                                               int &FrameIndex) const {
2359   switch (MI.getOpcode()) {
2360   default:
2361     break;
2362   case AArch64::STRWui:
2363   case AArch64::STRXui:
2364   case AArch64::STRBui:
2365   case AArch64::STRHui:
2366   case AArch64::STRSui:
2367   case AArch64::STRDui:
2368   case AArch64::STRQui:
2369   case AArch64::STR_PXI:
2370     if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2371         MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2372       FrameIndex = MI.getOperand(1).getIndex();
2373       return MI.getOperand(0).getReg();
2374     }
2375     break;
2376   }
2377   return 0;
2378 }
2379 
2380 /// Check all MachineMemOperands for a hint to suppress pairing.
2381 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
2382   return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2383     return MMO->getFlags() & MOSuppressPair;
2384   });
2385 }
2386 
2387 /// Set a flag on the first MachineMemOperand to suppress pairing.
2388 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
2389   if (MI.memoperands_empty())
2390     return;
2391   (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2392 }
2393 
2394 /// Check all MachineMemOperands for a hint that the load/store is strided.
2395 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
2396   return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2397     return MMO->getFlags() & MOStridedAccess;
2398   });
2399 }
2400 
2401 bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) {
2402   switch (Opc) {
2403   default:
2404     return false;
2405   case AArch64::STURSi:
2406   case AArch64::STRSpre:
2407   case AArch64::STURDi:
2408   case AArch64::STRDpre:
2409   case AArch64::STURQi:
2410   case AArch64::STRQpre:
2411   case AArch64::STURBBi:
2412   case AArch64::STURHHi:
2413   case AArch64::STURWi:
2414   case AArch64::STRWpre:
2415   case AArch64::STURXi:
2416   case AArch64::STRXpre:
2417   case AArch64::LDURSi:
2418   case AArch64::LDRSpre:
2419   case AArch64::LDURDi:
2420   case AArch64::LDRDpre:
2421   case AArch64::LDURQi:
2422   case AArch64::LDRQpre:
2423   case AArch64::LDURWi:
2424   case AArch64::LDRWpre:
2425   case AArch64::LDURXi:
2426   case AArch64::LDRXpre:
2427   case AArch64::LDRSWpre:
2428   case AArch64::LDURSWi:
2429   case AArch64::LDURHHi:
2430   case AArch64::LDURBBi:
2431   case AArch64::LDURSBWi:
2432   case AArch64::LDURSHWi:
2433     return true;
2434   }
2435 }
2436 
2437 std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2438   switch (Opc) {
2439   default: return {};
2440   case AArch64::PRFMui: return AArch64::PRFUMi;
2441   case AArch64::LDRXui: return AArch64::LDURXi;
2442   case AArch64::LDRWui: return AArch64::LDURWi;
2443   case AArch64::LDRBui: return AArch64::LDURBi;
2444   case AArch64::LDRHui: return AArch64::LDURHi;
2445   case AArch64::LDRSui: return AArch64::LDURSi;
2446   case AArch64::LDRDui: return AArch64::LDURDi;
2447   case AArch64::LDRQui: return AArch64::LDURQi;
2448   case AArch64::LDRBBui: return AArch64::LDURBBi;
2449   case AArch64::LDRHHui: return AArch64::LDURHHi;
2450   case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2451   case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2452   case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2453   case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2454   case AArch64::LDRSWui: return AArch64::LDURSWi;
2455   case AArch64::STRXui: return AArch64::STURXi;
2456   case AArch64::STRWui: return AArch64::STURWi;
2457   case AArch64::STRBui: return AArch64::STURBi;
2458   case AArch64::STRHui: return AArch64::STURHi;
2459   case AArch64::STRSui: return AArch64::STURSi;
2460   case AArch64::STRDui: return AArch64::STURDi;
2461   case AArch64::STRQui: return AArch64::STURQi;
2462   case AArch64::STRBBui: return AArch64::STURBBi;
2463   case AArch64::STRHHui: return AArch64::STURHHi;
2464   }
2465 }
2466 
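/// Return the operand index of the immediate offset for load/store opcode
/// \p Opc: 2 for plain reg/FI+imm forms, 3 for paired and pre/post-indexed
/// forms, and 4 for pre/post-indexed paired forms (summarizing the cases
/// below).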
2467 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
2468   switch (Opc) {
2469   default:
2470     llvm_unreachable("Unhandled Opcode in getLoadStoreImmIdx");
2471   case AArch64::ADDG:
2472   case AArch64::LDAPURBi:
2473   case AArch64::LDAPURHi:
2474   case AArch64::LDAPURi:
2475   case AArch64::LDAPURSBWi:
2476   case AArch64::LDAPURSBXi:
2477   case AArch64::LDAPURSHWi:
2478   case AArch64::LDAPURSHXi:
2479   case AArch64::LDAPURSWi:
2480   case AArch64::LDAPURXi:
2481   case AArch64::LDR_PPXI:
2482   case AArch64::LDR_PXI:
2483   case AArch64::LDR_ZXI:
2484   case AArch64::LDR_ZZXI:
2485   case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
2486   case AArch64::LDR_ZZZXI:
2487   case AArch64::LDR_ZZZZXI:
2488   case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
2489   case AArch64::LDRBBui:
2490   case AArch64::LDRBui:
2491   case AArch64::LDRDui:
2492   case AArch64::LDRHHui:
2493   case AArch64::LDRHui:
2494   case AArch64::LDRQui:
2495   case AArch64::LDRSBWui:
2496   case AArch64::LDRSBXui:
2497   case AArch64::LDRSHWui:
2498   case AArch64::LDRSHXui:
2499   case AArch64::LDRSui:
2500   case AArch64::LDRSWui:
2501   case AArch64::LDRWui:
2502   case AArch64::LDRXui:
2503   case AArch64::LDURBBi:
2504   case AArch64::LDURBi:
2505   case AArch64::LDURDi:
2506   case AArch64::LDURHHi:
2507   case AArch64::LDURHi:
2508   case AArch64::LDURQi:
2509   case AArch64::LDURSBWi:
2510   case AArch64::LDURSBXi:
2511   case AArch64::LDURSHWi:
2512   case AArch64::LDURSHXi:
2513   case AArch64::LDURSi:
2514   case AArch64::LDURSWi:
2515   case AArch64::LDURWi:
2516   case AArch64::LDURXi:
2517   case AArch64::PRFMui:
2518   case AArch64::PRFUMi:
2519   case AArch64::ST2Gi:
2520   case AArch64::STGi:
2521   case AArch64::STLURBi:
2522   case AArch64::STLURHi:
2523   case AArch64::STLURWi:
2524   case AArch64::STLURXi:
2525   case AArch64::StoreSwiftAsyncContext:
2526   case AArch64::STR_PPXI:
2527   case AArch64::STR_PXI:
2528   case AArch64::STR_ZXI:
2529   case AArch64::STR_ZZXI:
2530   case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
2531   case AArch64::STR_ZZZXI:
2532   case AArch64::STR_ZZZZXI:
2533   case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
2534   case AArch64::STRBBui:
2535   case AArch64::STRBui:
2536   case AArch64::STRDui:
2537   case AArch64::STRHHui:
2538   case AArch64::STRHui:
2539   case AArch64::STRQui:
2540   case AArch64::STRSui:
2541   case AArch64::STRWui:
2542   case AArch64::STRXui:
2543   case AArch64::STURBBi:
2544   case AArch64::STURBi:
2545   case AArch64::STURDi:
2546   case AArch64::STURHHi:
2547   case AArch64::STURHi:
2548   case AArch64::STURQi:
2549   case AArch64::STURSi:
2550   case AArch64::STURWi:
2551   case AArch64::STURXi:
2552   case AArch64::STZ2Gi:
2553   case AArch64::STZGi:
2554   case AArch64::TAGPstack:
2555   case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO:
2556   case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO:
2557     return 2;
2558   case AArch64::LD1B_D_IMM:
2559   case AArch64::LD1B_H_IMM:
2560   case AArch64::LD1B_IMM:
2561   case AArch64::LD1B_S_IMM:
2562   case AArch64::LD1D_IMM:
2563   case AArch64::LD1H_D_IMM:
2564   case AArch64::LD1H_IMM:
2565   case AArch64::LD1H_S_IMM:
2566   case AArch64::LD1RB_D_IMM:
2567   case AArch64::LD1RB_H_IMM:
2568   case AArch64::LD1RB_IMM:
2569   case AArch64::LD1RB_S_IMM:
2570   case AArch64::LD1RD_IMM:
2571   case AArch64::LD1RH_D_IMM:
2572   case AArch64::LD1RH_IMM:
2573   case AArch64::LD1RH_S_IMM:
2574   case AArch64::LD1RSB_D_IMM:
2575   case AArch64::LD1RSB_H_IMM:
2576   case AArch64::LD1RSB_S_IMM:
2577   case AArch64::LD1RSH_D_IMM:
2578   case AArch64::LD1RSH_S_IMM:
2579   case AArch64::LD1RSW_IMM:
2580   case AArch64::LD1RW_D_IMM:
2581   case AArch64::LD1RW_IMM:
2582   case AArch64::LD1SB_D_IMM:
2583   case AArch64::LD1SB_H_IMM:
2584   case AArch64::LD1SB_S_IMM:
2585   case AArch64::LD1SH_D_IMM:
2586   case AArch64::LD1SH_S_IMM:
2587   case AArch64::LD1SW_D_IMM:
2588   case AArch64::LD1W_D_IMM:
2589   case AArch64::LD1W_IMM:
2590   case AArch64::LD2B_IMM:
2591   case AArch64::LD2D_IMM:
2592   case AArch64::LD2H_IMM:
2593   case AArch64::LD2W_IMM:
2594   case AArch64::LD3B_IMM:
2595   case AArch64::LD3D_IMM:
2596   case AArch64::LD3H_IMM:
2597   case AArch64::LD3W_IMM:
2598   case AArch64::LD4B_IMM:
2599   case AArch64::LD4D_IMM:
2600   case AArch64::LD4H_IMM:
2601   case AArch64::LD4W_IMM:
2602   case AArch64::LDG:
2603   case AArch64::LDNF1B_D_IMM:
2604   case AArch64::LDNF1B_H_IMM:
2605   case AArch64::LDNF1B_IMM:
2606   case AArch64::LDNF1B_S_IMM:
2607   case AArch64::LDNF1D_IMM:
2608   case AArch64::LDNF1H_D_IMM:
2609   case AArch64::LDNF1H_IMM:
2610   case AArch64::LDNF1H_S_IMM:
2611   case AArch64::LDNF1SB_D_IMM:
2612   case AArch64::LDNF1SB_H_IMM:
2613   case AArch64::LDNF1SB_S_IMM:
2614   case AArch64::LDNF1SH_D_IMM:
2615   case AArch64::LDNF1SH_S_IMM:
2616   case AArch64::LDNF1SW_D_IMM:
2617   case AArch64::LDNF1W_D_IMM:
2618   case AArch64::LDNF1W_IMM:
2619   case AArch64::LDNPDi:
2620   case AArch64::LDNPQi:
2621   case AArch64::LDNPSi:
2622   case AArch64::LDNPWi:
2623   case AArch64::LDNPXi:
2624   case AArch64::LDNT1B_ZRI:
2625   case AArch64::LDNT1D_ZRI:
2626   case AArch64::LDNT1H_ZRI:
2627   case AArch64::LDNT1W_ZRI:
2628   case AArch64::LDPDi:
2629   case AArch64::LDPQi:
2630   case AArch64::LDPSi:
2631   case AArch64::LDPWi:
2632   case AArch64::LDPXi:
2633   case AArch64::LDRBBpost:
2634   case AArch64::LDRBBpre:
2635   case AArch64::LDRBpost:
2636   case AArch64::LDRBpre:
2637   case AArch64::LDRDpost:
2638   case AArch64::LDRDpre:
2639   case AArch64::LDRHHpost:
2640   case AArch64::LDRHHpre:
2641   case AArch64::LDRHpost:
2642   case AArch64::LDRHpre:
2643   case AArch64::LDRQpost:
2644   case AArch64::LDRQpre:
2645   case AArch64::LDRSpost:
2646   case AArch64::LDRSpre:
2647   case AArch64::LDRWpost:
2648   case AArch64::LDRWpre:
2649   case AArch64::LDRXpost:
2650   case AArch64::LDRXpre:
2651   case AArch64::ST1B_D_IMM:
2652   case AArch64::ST1B_H_IMM:
2653   case AArch64::ST1B_IMM:
2654   case AArch64::ST1B_S_IMM:
2655   case AArch64::ST1D_IMM:
2656   case AArch64::ST1H_D_IMM:
2657   case AArch64::ST1H_IMM:
2658   case AArch64::ST1H_S_IMM:
2659   case AArch64::ST1W_D_IMM:
2660   case AArch64::ST1W_IMM:
2661   case AArch64::ST2B_IMM:
2662   case AArch64::ST2D_IMM:
2663   case AArch64::ST2H_IMM:
2664   case AArch64::ST2W_IMM:
2665   case AArch64::ST3B_IMM:
2666   case AArch64::ST3D_IMM:
2667   case AArch64::ST3H_IMM:
2668   case AArch64::ST3W_IMM:
2669   case AArch64::ST4B_IMM:
2670   case AArch64::ST4D_IMM:
2671   case AArch64::ST4H_IMM:
2672   case AArch64::ST4W_IMM:
2673   case AArch64::STGPi:
2674   case AArch64::STGPreIndex:
2675   case AArch64::STZGPreIndex:
2676   case AArch64::ST2GPreIndex:
2677   case AArch64::STZ2GPreIndex:
2678   case AArch64::STGPostIndex:
2679   case AArch64::STZGPostIndex:
2680   case AArch64::ST2GPostIndex:
2681   case AArch64::STZ2GPostIndex:
2682   case AArch64::STNPDi:
2683   case AArch64::STNPQi:
2684   case AArch64::STNPSi:
2685   case AArch64::STNPWi:
2686   case AArch64::STNPXi:
2687   case AArch64::STNT1B_ZRI:
2688   case AArch64::STNT1D_ZRI:
2689   case AArch64::STNT1H_ZRI:
2690   case AArch64::STNT1W_ZRI:
2691   case AArch64::STPDi:
2692   case AArch64::STPQi:
2693   case AArch64::STPSi:
2694   case AArch64::STPWi:
2695   case AArch64::STPXi:
2696   case AArch64::STRBBpost:
2697   case AArch64::STRBBpre:
2698   case AArch64::STRBpost:
2699   case AArch64::STRBpre:
2700   case AArch64::STRDpost:
2701   case AArch64::STRDpre:
2702   case AArch64::STRHHpost:
2703   case AArch64::STRHHpre:
2704   case AArch64::STRHpost:
2705   case AArch64::STRHpre:
2706   case AArch64::STRQpost:
2707   case AArch64::STRQpre:
2708   case AArch64::STRSpost:
2709   case AArch64::STRSpre:
2710   case AArch64::STRWpost:
2711   case AArch64::STRWpre:
2712   case AArch64::STRXpost:
2713   case AArch64::STRXpre:
2714     return 3;
2715   case AArch64::LDPDpost:
2716   case AArch64::LDPDpre:
2717   case AArch64::LDPQpost:
2718   case AArch64::LDPQpre:
2719   case AArch64::LDPSpost:
2720   case AArch64::LDPSpre:
2721   case AArch64::LDPWpost:
2722   case AArch64::LDPWpre:
2723   case AArch64::LDPXpost:
2724   case AArch64::LDPXpre:
2725   case AArch64::STGPpre:
2726   case AArch64::STGPpost:
2727   case AArch64::STPDpost:
2728   case AArch64::STPDpre:
2729   case AArch64::STPQpost:
2730   case AArch64::STPQpre:
2731   case AArch64::STPSpost:
2732   case AArch64::STPSpre:
2733   case AArch64::STPWpost:
2734   case AArch64::STPWpre:
2735   case AArch64::STPXpost:
2736   case AArch64::STPXpre:
2737     return 4;
2738   }
2739 }
2740 
2741 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
2742   switch (MI.getOpcode()) {
2743   default:
2744     return false;
2745   // Scaled instructions.
2746   case AArch64::STRSui:
2747   case AArch64::STRDui:
2748   case AArch64::STRQui:
2749   case AArch64::STRXui:
2750   case AArch64::STRWui:
2751   case AArch64::LDRSui:
2752   case AArch64::LDRDui:
2753   case AArch64::LDRQui:
2754   case AArch64::LDRXui:
2755   case AArch64::LDRWui:
2756   case AArch64::LDRSWui:
2757   // Unscaled and pre-indexed instructions.
2758   case AArch64::STURSi:
2759   case AArch64::STRSpre:
2760   case AArch64::STURDi:
2761   case AArch64::STRDpre:
2762   case AArch64::STURQi:
2763   case AArch64::STRQpre:
2764   case AArch64::STURWi:
2765   case AArch64::STRWpre:
2766   case AArch64::STURXi:
2767   case AArch64::STRXpre:
2768   case AArch64::LDURSi:
2769   case AArch64::LDRSpre:
2770   case AArch64::LDURDi:
2771   case AArch64::LDRDpre:
2772   case AArch64::LDURQi:
2773   case AArch64::LDRQpre:
2774   case AArch64::LDURWi:
2775   case AArch64::LDRWpre:
2776   case AArch64::LDURXi:
2777   case AArch64::LDRXpre:
2778   case AArch64::LDURSWi:
2779   case AArch64::LDRSWpre:
2780   // SVE instructions.
2781   case AArch64::LDR_ZXI:
2782   case AArch64::STR_ZXI:
2783     return true;
2784   }
2785 }
2786 
2787 bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) {
2788   switch (MI.getOpcode()) {
2789   default:
2790     assert((!MI.isCall() || !MI.isReturn()) &&
2791            "Unexpected instruction - was a new tail call opcode introduced?");
2792     return false;
2793   case AArch64::TCRETURNdi:
2794   case AArch64::TCRETURNri:
2795   case AArch64::TCRETURNrix16x17:
2796   case AArch64::TCRETURNrix17:
2797   case AArch64::TCRETURNrinotx16:
2798   case AArch64::TCRETURNriALL:
2799   case AArch64::AUTH_TCRETURN:
2800   case AArch64::AUTH_TCRETURN_BTI:
2801     return true;
2802   }
2803 }
2804 
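/// Return the flag-setting (S-suffixed) equivalent of \p Opc, e.g.
/// ADDWri -> ADDSWri; asserts if \p Opc has no flag-setting equivalent.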
2805 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) {
2806   switch (Opc) {
2807   default:
2808     llvm_unreachable("Opcode has no flag setting equivalent!");
2809   // 32-bit cases:
2810   case AArch64::ADDWri:
2811     return AArch64::ADDSWri;
2812   case AArch64::ADDWrr:
2813     return AArch64::ADDSWrr;
2814   case AArch64::ADDWrs:
2815     return AArch64::ADDSWrs;
2816   case AArch64::ADDWrx:
2817     return AArch64::ADDSWrx;
2818   case AArch64::ANDWri:
2819     return AArch64::ANDSWri;
2820   case AArch64::ANDWrr:
2821     return AArch64::ANDSWrr;
2822   case AArch64::ANDWrs:
2823     return AArch64::ANDSWrs;
2824   case AArch64::BICWrr:
2825     return AArch64::BICSWrr;
2826   case AArch64::BICWrs:
2827     return AArch64::BICSWrs;
2828   case AArch64::SUBWri:
2829     return AArch64::SUBSWri;
2830   case AArch64::SUBWrr:
2831     return AArch64::SUBSWrr;
2832   case AArch64::SUBWrs:
2833     return AArch64::SUBSWrs;
2834   case AArch64::SUBWrx:
2835     return AArch64::SUBSWrx;
2836   // 64-bit cases:
2837   case AArch64::ADDXri:
2838     return AArch64::ADDSXri;
2839   case AArch64::ADDXrr:
2840     return AArch64::ADDSXrr;
2841   case AArch64::ADDXrs:
2842     return AArch64::ADDSXrs;
2843   case AArch64::ADDXrx:
2844     return AArch64::ADDSXrx;
2845   case AArch64::ANDXri:
2846     return AArch64::ANDSXri;
2847   case AArch64::ANDXrr:
2848     return AArch64::ANDSXrr;
2849   case AArch64::ANDXrs:
2850     return AArch64::ANDSXrs;
2851   case AArch64::BICXrr:
2852     return AArch64::BICSXrr;
2853   case AArch64::BICXrs:
2854     return AArch64::BICSXrs;
2855   case AArch64::SUBXri:
2856     return AArch64::SUBSXri;
2857   case AArch64::SUBXrr:
2858     return AArch64::SUBSXrr;
2859   case AArch64::SUBXrs:
2860     return AArch64::SUBSXrs;
2861   case AArch64::SUBXrx:
2862     return AArch64::SUBSXrx;
2863   // SVE instructions:
2864   case AArch64::AND_PPzPP:
2865     return AArch64::ANDS_PPzPP;
2866   case AArch64::BIC_PPzPP:
2867     return AArch64::BICS_PPzPP;
2868   case AArch64::EOR_PPzPP:
2869     return AArch64::EORS_PPzPP;
2870   case AArch64::NAND_PPzPP:
2871     return AArch64::NANDS_PPzPP;
2872   case AArch64::NOR_PPzPP:
2873     return AArch64::NORS_PPzPP;
2874   case AArch64::ORN_PPzPP:
2875     return AArch64::ORNS_PPzPP;
2876   case AArch64::ORR_PPzPP:
2877     return AArch64::ORRS_PPzPP;
2878   case AArch64::BRKA_PPzP:
2879     return AArch64::BRKAS_PPzP;
2880   case AArch64::BRKPA_PPzPP:
2881     return AArch64::BRKPAS_PPzPP;
2882   case AArch64::BRKB_PPzP:
2883     return AArch64::BRKBS_PPzP;
2884   case AArch64::BRKPB_PPzPP:
2885     return AArch64::BRKPBS_PPzPP;
2886   case AArch64::BRKN_PPzP:
2887     return AArch64::BRKNS_PPzP;
2888   case AArch64::RDFFR_PPz:
2889     return AArch64::RDFFRS_PPz;
2890   case AArch64::PTRUE_B:
2891     return AArch64::PTRUES_B;
2892   }
2893 }
2894 
2895 // Is this a candidate for ld/st merging or pairing?  For example, we don't
2896 // touch volatiles or load/stores that have a hint to avoid pair formation.
2897 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
2898 
2899   bool IsPreLdSt = isPreLdSt(MI);
2900 
2901   // If this is a volatile load/store, don't mess with it.
2902   if (MI.hasOrderedMemoryRef())
2903     return false;
2904 
2905   // Make sure this is a reg/fi+imm (as opposed to an address reloc).
2906   // For Pre-inc LD/ST, the operand is shifted by one.
2907   assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
2908           MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
2909          "Expected a reg or frame index operand.");
2910 
2911   // For Pre-indexed addressing quadword instructions, the third operand is the
2912   // immediate value.
2913   bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
2914 
2915   if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
2916     return false;
2917 
2918   // Can't merge/pair if the instruction modifies the base register.
2919   // e.g., ldr x0, [x0]
2920   // This case will never occur with an FI base.
2921   // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
2922   // STR<S,D,Q,W,X>pre, it can be merged.
2923   // For example:
2924   //   ldr q0, [x11, #32]!
2925   //   ldr q1, [x11, #16]
2926   //   to
2927   //   ldp q0, q1, [x11, #32]!
2928   if (MI.getOperand(1).isReg() && !IsPreLdSt) {
2929     Register BaseReg = MI.getOperand(1).getReg();
2930     const TargetRegisterInfo *TRI = &getRegisterInfo();
2931     if (MI.modifiesRegister(BaseReg, TRI))
2932       return false;
2933   }
2934 
2935   // Pairing SVE fills/spills is only valid for little-endian targets that
2936   // implement VLS 128.
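  // With a fixed 128-bit vector length, LDR_ZXI/STR_ZXI access exactly 16
  // bytes, so on little-endian targets two adjacent fills/spills can combine
  // into a single Q-register pair access.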
2937   switch (MI.getOpcode()) {
2938   default:
2939     break;
2940   case AArch64::LDR_ZXI:
2941   case AArch64::STR_ZXI:
2942     if (!Subtarget.isLittleEndian() ||
2943         Subtarget.getSVEVectorSizeInBits() != 128)
2944       return false;
2945   }
2946 
2947   // Check if this load/store has a hint to avoid pair formation.
2948   // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
2949   if (isLdStPairSuppressed(MI))
2950     return false;
2951 
2952   // Do not pair any callee-save store/reload instructions in the
2953   // prologue/epilogue if the CFI information encoded the operations as separate
2954   // instructions, as that would make the size of the actual prologue differ
2955   // from the prologue size recorded in the Windows CFI.
2956   const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
2957   bool NeedsWinCFI = MAI->usesWindowsCFI() &&
2958                      MI.getMF()->getFunction().needsUnwindTableEntry();
2959   if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
2960                       MI.getFlag(MachineInstr::FrameDestroy)))
2961     return false;
2962 
2963   // On some CPUs quad load/store pairs are slower than two single load/stores.
2964   if (Subtarget.isPaired128Slow()) {
2965     switch (MI.getOpcode()) {
2966     default:
2967       break;
2968     case AArch64::LDURQi:
2969     case AArch64::STURQi:
2970     case AArch64::LDRQui:
2971     case AArch64::STRQui:
2972       return false;
2973     }
2974   }
2975 
2976   return true;
2977 }
2978 
2979 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
2980     const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
2981     int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
2982     const TargetRegisterInfo *TRI) const {
2983   if (!LdSt.mayLoadOrStore())
2984     return false;
2985 
2986   const MachineOperand *BaseOp;
2987   TypeSize WidthN(0, false);
2988   if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
2989                                     WidthN, TRI))
2990     return false;
2991   // The maximum vscale is 16 under AArch64; return the maximal extent for the
2992   // vector.
2993   Width = LocationSize::precise(WidthN);
2994   BaseOps.push_back(BaseOp);
2995   return true;
2996 }
2997 
2998 std::optional<ExtAddrMode>
2999 AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
3000                                           const TargetRegisterInfo *TRI) const {
3001   const MachineOperand *Base; // Filled with the base operand of MI.
3002   int64_t Offset;             // Filled with the offset of MI.
3003   bool OffsetIsScalable;
3004   if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
3005     return std::nullopt;
3006 
3007   if (!Base->isReg())
3008     return std::nullopt;
3009   ExtAddrMode AM;
3010   AM.BaseReg = Base->getReg();
3011   AM.Displacement = Offset;
3012   AM.ScaledReg = 0;
3013   AM.Scale = 0;
3014   return AM;
3015 }
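// For example, for `ldr x1, [x0, #16]` this returns an ExtAddrMode with
// BaseReg = x0, Displacement = 16, ScaledReg = 0 and Scale = 0.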
3016 
3017 bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
3018                                            Register Reg,
3019                                            const MachineInstr &AddrI,
3020                                            ExtAddrMode &AM) const {
3021   // Filter out instructions into which we cannot fold.
3022   unsigned NumBytes;
3023   int64_t OffsetScale = 1;
3024   switch (MemI.getOpcode()) {
3025   default:
3026     return false;
3027 
3028   case AArch64::LDURQi:
3029   case AArch64::STURQi:
3030     NumBytes = 16;
3031     break;
3032 
3033   case AArch64::LDURDi:
3034   case AArch64::STURDi:
3035   case AArch64::LDURXi:
3036   case AArch64::STURXi:
3037     NumBytes = 8;
3038     break;
3039 
3040   case AArch64::LDURWi:
3041   case AArch64::LDURSWi:
3042   case AArch64::STURWi:
3043     NumBytes = 4;
3044     break;
3045 
3046   case AArch64::LDURHi:
3047   case AArch64::STURHi:
3048   case AArch64::LDURHHi:
3049   case AArch64::STURHHi:
3050   case AArch64::LDURSHXi:
3051   case AArch64::LDURSHWi:
3052     NumBytes = 2;
3053     break;
3054 
3055   case AArch64::LDRBroX:
3056   case AArch64::LDRBBroX:
3057   case AArch64::LDRSBXroX:
3058   case AArch64::LDRSBWroX:
3059   case AArch64::STRBroX:
3060   case AArch64::STRBBroX:
3061   case AArch64::LDURBi:
3062   case AArch64::LDURBBi:
3063   case AArch64::LDURSBXi:
3064   case AArch64::LDURSBWi:
3065   case AArch64::STURBi:
3066   case AArch64::STURBBi:
3067   case AArch64::LDRBui:
3068   case AArch64::LDRBBui:
3069   case AArch64::LDRSBXui:
3070   case AArch64::LDRSBWui:
3071   case AArch64::STRBui:
3072   case AArch64::STRBBui:
3073     NumBytes = 1;
3074     break;
3075 
3076   case AArch64::LDRQroX:
3077   case AArch64::STRQroX:
3078   case AArch64::LDRQui:
3079   case AArch64::STRQui:
3080     NumBytes = 16;
3081     OffsetScale = 16;
3082     break;
3083 
3084   case AArch64::LDRDroX:
3085   case AArch64::STRDroX:
3086   case AArch64::LDRXroX:
3087   case AArch64::STRXroX:
3088   case AArch64::LDRDui:
3089   case AArch64::STRDui:
3090   case AArch64::LDRXui:
3091   case AArch64::STRXui:
3092     NumBytes = 8;
3093     OffsetScale = 8;
3094     break;
3095 
3096   case AArch64::LDRWroX:
3097   case AArch64::LDRSWroX:
3098   case AArch64::STRWroX:
3099   case AArch64::LDRWui:
3100   case AArch64::LDRSWui:
3101   case AArch64::STRWui:
3102     NumBytes = 4;
3103     OffsetScale = 4;
3104     break;
3105 
3106   case AArch64::LDRHroX:
3107   case AArch64::STRHroX:
3108   case AArch64::LDRHHroX:
3109   case AArch64::STRHHroX:
3110   case AArch64::LDRSHXroX:
3111   case AArch64::LDRSHWroX:
3112   case AArch64::LDRHui:
3113   case AArch64::STRHui:
3114   case AArch64::LDRHHui:
3115   case AArch64::STRHHui:
3116   case AArch64::LDRSHXui:
3117   case AArch64::LDRSHWui:
3118     NumBytes = 2;
3119     OffsetScale = 2;
3120     break;
3121   }
3122 
3123   // Check the fold operand is not the loaded/stored value.
3124   const MachineOperand &BaseRegOp = MemI.getOperand(0);
3125   if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
3126     return false;
3127 
3128   // Handle memory instructions with a [Reg, Reg] addressing mode.
3129   if (MemI.getOperand(2).isReg()) {
3130     // Bail if the addressing mode already includes extension of the offset
3131     // register.
3132     if (MemI.getOperand(3).getImm())
3133       return false;
3134 
3135     // Check if we actually have a scaled offset.
3136     if (MemI.getOperand(4).getImm() == 0)
3137       OffsetScale = 1;
3138 
3139     // If the address instruction is folded into the base register, the
3140     // addressing mode must not have a scale; only then can we swap the base
3141     // and the scaled registers.
3142     if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)
3143       return false;
3144 
3145     switch (AddrI.getOpcode()) {
3146     default:
3147       return false;
3148 
3149     case AArch64::SBFMXri:
3150       // sxtw Xa, Wm
3151       // ldr Xd, [Xn, Xa, lsl #N]
3152       // ->
3153       // ldr Xd, [Xn, Wm, sxtw #N]
3154       if (AddrI.getOperand(2).getImm() != 0 ||
3155           AddrI.getOperand(3).getImm() != 31)
3156         return false;
3157 
3158       AM.BaseReg = MemI.getOperand(1).getReg();
3159       if (AM.BaseReg == Reg)
3160         AM.BaseReg = MemI.getOperand(2).getReg();
3161       AM.ScaledReg = AddrI.getOperand(1).getReg();
3162       AM.Scale = OffsetScale;
3163       AM.Displacement = 0;
3164       AM.Form = ExtAddrMode::Formula::SExtScaledReg;
3165       return true;
3166 
3167     case TargetOpcode::SUBREG_TO_REG: {
3168       // mov Wa, Wm
3169       // ldr Xd, [Xn, Xa, lsl #N]
3170       // ->
3171       // ldr Xd, [Xn, Wm, uxtw #N]
3172 
3173       // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
3174       if (AddrI.getOperand(1).getImm() != 0 ||
3175           AddrI.getOperand(3).getImm() != AArch64::sub_32)
3176         return false;
3177 
3178       const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
3179       Register OffsetReg = AddrI.getOperand(2).getReg();
3180       if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg))
3181         return false;
3182 
3183       const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg);
3184       if (DefMI.getOpcode() != AArch64::ORRWrs ||
3185           DefMI.getOperand(1).getReg() != AArch64::WZR ||
3186           DefMI.getOperand(3).getImm() != 0)
3187         return false;
3188 
3189       AM.BaseReg = MemI.getOperand(1).getReg();
3190       if (AM.BaseReg == Reg)
3191         AM.BaseReg = MemI.getOperand(2).getReg();
3192       AM.ScaledReg = DefMI.getOperand(2).getReg();
3193       AM.Scale = OffsetScale;
3194       AM.Displacement = 0;
3195       AM.Form = ExtAddrMode::Formula::ZExtScaledReg;
3196       return true;
3197     }
3198     }
3199   }
3200 
3201   // Handle memory instructions with a [Reg, #Imm] addressing mode.
3202 
3203   // Check we are not breaking a potential conversion to an LDP.
3204   auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
3205                                  int64_t NewOffset) -> bool {
3206     int64_t MinOffset, MaxOffset;
3207     switch (NumBytes) {
3208     default:
3209       return true;
3210     case 4:
3211       MinOffset = -256;
3212       MaxOffset = 252;
3213       break;
3214     case 8:
3215       MinOffset = -512;
3216       MaxOffset = 504;
3217       break;
3218     case 16:
3219       MinOffset = -1024;
3220       MaxOffset = 1008;
3221       break;
3222     }
3223     return OldOffset < MinOffset || OldOffset > MaxOffset ||
3224            (NewOffset >= MinOffset && NewOffset <= MaxOffset);
3225   };
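  // These ranges follow from the LDP/STP encoding: a signed 7-bit immediate
  // scaled by the access size, e.g. 8-byte accesses give
  // [-64 * 8, 63 * 8] = [-512, 504].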
3226   auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
3227     int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;
3228     int64_t NewOffset = OldOffset + Disp;
3229     if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))
3230       return false;
3231     // If the old offset would fit into an LDP, but the new offset wouldn't,
3232     // bail out.
3233     if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
3234       return false;
3235     AM.BaseReg = AddrI.getOperand(1).getReg();
3236     AM.ScaledReg = 0;
3237     AM.Scale = 0;
3238     AM.Displacement = NewOffset;
3239     AM.Form = ExtAddrMode::Formula::Basic;
3240     return true;
3241   };
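  // For example:
  //   add x8, x0, #16
  //   ldr x1, [x8, #8]
  // folds to
  //   ldr x1, [x0, #24]
  // provided #24 is legal for the access size and any LDP opportunity is
  // preserved.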
3242 
3243   auto canFoldAddRegIntoAddrMode =
3244       [&](int64_t Scale,
3245           ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool {
3246     if (MemI.getOperand(2).getImm() != 0)
3247       return false;
3248     if ((unsigned)Scale != Scale)
3249       return false;
3250     if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
3251       return false;
3252     AM.BaseReg = AddrI.getOperand(1).getReg();
3253     AM.ScaledReg = AddrI.getOperand(2).getReg();
3254     AM.Scale = Scale;
3255     AM.Displacement = 0;
3256     AM.Form = Form;
3257     return true;
3258   };
3259 
3260   auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
3261     unsigned Opcode = MemI.getOpcode();
3262     return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
3263            Subtarget.isSTRQroSlow();
3264   };
3265 
3266   int64_t Disp = 0;
3267   const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
3268   switch (AddrI.getOpcode()) {
3269   default:
3270     return false;
3271 
3272   case AArch64::ADDXri:
3273     // add Xa, Xn, #N
3274     // ldr Xd, [Xa, #M]
3275     // ->
3276     // ldr Xd, [Xn, #N'+M]
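    // N' = N << {0,12}: the ADDXri immediate may carry an optional LSL #12,
    // so apply the shift operand before folding.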
3277     Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3278     return canFoldAddSubImmIntoAddrMode(Disp);
3279 
3280   case AArch64::SUBXri:
3281     // sub Xa, Xn, #N
3282     // ldr Xd, [Xa, #M]
3283     // ->
3284     // ldr Xd, [Xn, #N'+M]
3285     Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3286     return canFoldAddSubImmIntoAddrMode(-Disp);
3287 
3288   case AArch64::ADDXrs: {
3289     // add Xa, Xn, Xm, lsl #N
3290     // ldr Xd, [Xa]
3291     // ->
3292     // ldr Xd, [Xn, Xm, lsl #N]
3293 
3294     // Don't fold the add if the result would be slower, unless optimising for
3295     // size.
3296     unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3297     if (AArch64_AM::getShiftType(Shift) != AArch64_AM::ShiftExtendType::LSL)
3298       return false;
3299     Shift = AArch64_AM::getShiftValue(Shift);
3300     if (!OptSize) {
3301       if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
3302         return false;
3303       if (avoidSlowSTRQ(MemI))
3304         return false;
3305     }
3306     return canFoldAddRegIntoAddrMode(1ULL << Shift);
3307   }
3308 
3309   case AArch64::ADDXrr:
3310     // add Xa, Xn, Xm
3311     // ldr Xd, [Xa]
3312     // ->
3313     // ldr Xd, [Xn, Xm, lsl #0]
3314 
3315     // Don't fold the add if the result would be slower, unless optimising for
3316     // size.
3317     if (!OptSize && avoidSlowSTRQ(MemI))
3318       return false;
3319     return canFoldAddRegIntoAddrMode(1);
3320 
3321   case AArch64::ADDXrx:
3322     // add Xa, Xn, Wm, {s,u}xtw #N
3323     // ldr Xd, [Xa]
3324     // ->
3325     // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3326 
3327     // Don't fold the add if the result would be slower, unless optimising for
3328     // size.
3329     if (!OptSize && avoidSlowSTRQ(MemI))
3330       return false;
3331 
3332     // Can fold only sign-/zero-extend of a word.
3333     unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3334     AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm);
3335     if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3336       return false;
3337 
3338     return canFoldAddRegIntoAddrMode(
3339         1ULL << AArch64_AM::getArithShiftValue(Imm),
3340         (Extend == AArch64_AM::SXTW) ? ExtAddrMode::Formula::SExtScaledReg
3341                                      : ExtAddrMode::Formula::ZExtScaledReg);
3342   }
3343 }
3344 
3345 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3346 // return the opcode of an instruction performing the same operation, but using
3347 // the [Reg, Reg] addressing mode.
3348 static unsigned regOffsetOpcode(unsigned Opcode) {
3349   switch (Opcode) {
3350   default:
3351     llvm_unreachable("Address folding not implemented for instruction");
3352 
3353   case AArch64::LDURQi:
3354   case AArch64::LDRQui:
3355     return AArch64::LDRQroX;
3356   case AArch64::STURQi:
3357   case AArch64::STRQui:
3358     return AArch64::STRQroX;
3359   case AArch64::LDURDi:
3360   case AArch64::LDRDui:
3361     return AArch64::LDRDroX;
3362   case AArch64::STURDi:
3363   case AArch64::STRDui:
3364     return AArch64::STRDroX;
3365   case AArch64::LDURXi:
3366   case AArch64::LDRXui:
3367     return AArch64::LDRXroX;
3368   case AArch64::STURXi:
3369   case AArch64::STRXui:
3370     return AArch64::STRXroX;
3371   case AArch64::LDURWi:
3372   case AArch64::LDRWui:
3373     return AArch64::LDRWroX;
3374   case AArch64::LDURSWi:
3375   case AArch64::LDRSWui:
3376     return AArch64::LDRSWroX;
3377   case AArch64::STURWi:
3378   case AArch64::STRWui:
3379     return AArch64::STRWroX;
3380   case AArch64::LDURHi:
3381   case AArch64::LDRHui:
3382     return AArch64::LDRHroX;
3383   case AArch64::STURHi:
3384   case AArch64::STRHui:
3385     return AArch64::STRHroX;
3386   case AArch64::LDURHHi:
3387   case AArch64::LDRHHui:
3388     return AArch64::LDRHHroX;
3389   case AArch64::STURHHi:
3390   case AArch64::STRHHui:
3391     return AArch64::STRHHroX;
3392   case AArch64::LDURSHXi:
3393   case AArch64::LDRSHXui:
3394     return AArch64::LDRSHXroX;
3395   case AArch64::LDURSHWi:
3396   case AArch64::LDRSHWui:
3397     return AArch64::LDRSHWroX;
3398   case AArch64::LDURBi:
3399   case AArch64::LDRBui:
3400     return AArch64::LDRBroX;
3401   case AArch64::LDURBBi:
3402   case AArch64::LDRBBui:
3403     return AArch64::LDRBBroX;
3404   case AArch64::LDURSBXi:
3405   case AArch64::LDRSBXui:
3406     return AArch64::LDRSBXroX;
3407   case AArch64::LDURSBWi:
3408   case AArch64::LDRSBWui:
3409     return AArch64::LDRSBWroX;
3410   case AArch64::STURBi:
3411   case AArch64::STRBui:
3412     return AArch64::STRBroX;
3413   case AArch64::STURBBi:
3414   case AArch64::STRBBui:
3415     return AArch64::STRBBroX;
3416   }
3417 }
3418 
3419 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3420 // the opcode of an instruction performing the same operation, but using the
3421 // [Reg, #Imm] addressing mode with scaled offset.
3422 static unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3423   switch (Opcode) {
3424   default:
3425     llvm_unreachable("Address folding not implemented for instruction");
3426 
3427   case AArch64::LDURQi:
3428     Scale = 16;
3429     return AArch64::LDRQui;
3430   case AArch64::STURQi:
3431     Scale = 16;
3432     return AArch64::STRQui;
3433   case AArch64::LDURDi:
3434     Scale = 8;
3435     return AArch64::LDRDui;
3436   case AArch64::STURDi:
3437     Scale = 8;
3438     return AArch64::STRDui;
3439   case AArch64::LDURXi:
3440     Scale = 8;
3441     return AArch64::LDRXui;
3442   case AArch64::STURXi:
3443     Scale = 8;
3444     return AArch64::STRXui;
3445   case AArch64::LDURWi:
3446     Scale = 4;
3447     return AArch64::LDRWui;
3448   case AArch64::LDURSWi:
3449     Scale = 4;
3450     return AArch64::LDRSWui;
3451   case AArch64::STURWi:
3452     Scale = 4;
3453     return AArch64::STRWui;
3454   case AArch64::LDURHi:
3455     Scale = 2;
3456     return AArch64::LDRHui;
3457   case AArch64::STURHi:
3458     Scale = 2;
3459     return AArch64::STRHui;
3460   case AArch64::LDURHHi:
3461     Scale = 2;
3462     return AArch64::LDRHHui;
3463   case AArch64::STURHHi:
3464     Scale = 2;
3465     return AArch64::STRHHui;
3466   case AArch64::LDURSHXi:
3467     Scale = 2;
3468     return AArch64::LDRSHXui;
3469   case AArch64::LDURSHWi:
3470     Scale = 2;
3471     return AArch64::LDRSHWui;
3472   case AArch64::LDURBi:
3473     Scale = 1;
3474     return AArch64::LDRBui;
3475   case AArch64::LDURBBi:
3476     Scale = 1;
3477     return AArch64::LDRBBui;
3478   case AArch64::LDURSBXi:
3479     Scale = 1;
3480     return AArch64::LDRSBXui;
3481   case AArch64::LDURSBWi:
3482     Scale = 1;
3483     return AArch64::LDRSBWui;
3484   case AArch64::STURBi:
3485     Scale = 1;
3486     return AArch64::STRBui;
3487   case AArch64::STURBBi:
3488     Scale = 1;
3489     return AArch64::STRBBui;
3490   case AArch64::LDRQui:
3491   case AArch64::STRQui:
3492     Scale = 16;
3493     return Opcode;
3494   case AArch64::LDRDui:
3495   case AArch64::STRDui:
3496   case AArch64::LDRXui:
3497   case AArch64::STRXui:
3498     Scale = 8;
3499     return Opcode;
3500   case AArch64::LDRWui:
3501   case AArch64::LDRSWui:
3502   case AArch64::STRWui:
3503     Scale = 4;
3504     return Opcode;
3505   case AArch64::LDRHui:
3506   case AArch64::STRHui:
3507   case AArch64::LDRHHui:
3508   case AArch64::STRHHui:
3509   case AArch64::LDRSHXui:
3510   case AArch64::LDRSHWui:
3511     Scale = 2;
3512     return Opcode;
3513   case AArch64::LDRBui:
3514   case AArch64::LDRBBui:
3515   case AArch64::LDRSBXui:
3516   case AArch64::LDRSBWui:
3517   case AArch64::STRBui:
3518   case AArch64::STRBBui:
3519     Scale = 1;
3520     return Opcode;
3521   }
3522 }
3523 
3524 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3525 // the opcode of an instruction performing the same operation, but using the
3526 // [Reg, #Imm] addressing mode with unscaled offset.
3527 static unsigned unscaledOffsetOpcode(unsigned Opcode) {
3528   switch (Opcode) {
3529   default:
3530     llvm_unreachable("Address folding not implemented for instruction");
3531 
3532   case AArch64::LDURQi:
3533   case AArch64::STURQi:
3534   case AArch64::LDURDi:
3535   case AArch64::STURDi:
3536   case AArch64::LDURXi:
3537   case AArch64::STURXi:
3538   case AArch64::LDURWi:
3539   case AArch64::LDURSWi:
3540   case AArch64::STURWi:
3541   case AArch64::LDURHi:
3542   case AArch64::STURHi:
3543   case AArch64::LDURHHi:
3544   case AArch64::STURHHi:
3545   case AArch64::LDURSHXi:
3546   case AArch64::LDURSHWi:
3547   case AArch64::LDURBi:
3548   case AArch64::STURBi:
3549   case AArch64::LDURBBi:
3550   case AArch64::STURBBi:
3551   case AArch64::LDURSBWi:
3552   case AArch64::LDURSBXi:
3553     return Opcode;
3554   case AArch64::LDRQui:
3555     return AArch64::LDURQi;
3556   case AArch64::STRQui:
3557     return AArch64::STURQi;
3558   case AArch64::LDRDui:
3559     return AArch64::LDURDi;
3560   case AArch64::STRDui:
3561     return AArch64::STURDi;
3562   case AArch64::LDRXui:
3563     return AArch64::LDURXi;
3564   case AArch64::STRXui:
3565     return AArch64::STURXi;
3566   case AArch64::LDRWui:
3567     return AArch64::LDURWi;
3568   case AArch64::LDRSWui:
3569     return AArch64::LDURSWi;
3570   case AArch64::STRWui:
3571     return AArch64::STURWi;
3572   case AArch64::LDRHui:
3573     return AArch64::LDURHi;
3574   case AArch64::STRHui:
3575     return AArch64::STURHi;
3576   case AArch64::LDRHHui:
3577     return AArch64::LDURHHi;
3578   case AArch64::STRHHui:
3579     return AArch64::STURHHi;
3580   case AArch64::LDRSHXui:
3581     return AArch64::LDURSHXi;
3582   case AArch64::LDRSHWui:
3583     return AArch64::LDURSHWi;
3584   case AArch64::LDRBBui:
3585     return AArch64::LDURBBi;
3586   case AArch64::LDRBui:
3587     return AArch64::LDURBi;
3588   case AArch64::STRBBui:
3589     return AArch64::STURBBi;
3590   case AArch64::STRBui:
3591     return AArch64::STURBi;
3592   case AArch64::LDRSBWui:
3593     return AArch64::LDURSBWi;
3594   case AArch64::LDRSBXui:
3595     return AArch64::LDURSBXi;
3596   }
3597 }
3598 
3599 // Given the opcode of a memory load/store instruction, return the opcode of an
3600 // instruction performing the same operation, but using
3601 // the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
3602 // offset register.
3603 static unsigned offsetExtendOpcode(unsigned Opcode) {
3604   switch (Opcode) {
3605   default:
3606     llvm_unreachable("Address folding not implemented for instruction");
3607 
3608   case AArch64::LDRQroX:
3609   case AArch64::LDURQi:
3610   case AArch64::LDRQui:
3611     return AArch64::LDRQroW;
3612   case AArch64::STRQroX:
3613   case AArch64::STURQi:
3614   case AArch64::STRQui:
3615     return AArch64::STRQroW;
3616   case AArch64::LDRDroX:
3617   case AArch64::LDURDi:
3618   case AArch64::LDRDui:
3619     return AArch64::LDRDroW;
3620   case AArch64::STRDroX:
3621   case AArch64::STURDi:
3622   case AArch64::STRDui:
3623     return AArch64::STRDroW;
3624   case AArch64::LDRXroX:
3625   case AArch64::LDURXi:
3626   case AArch64::LDRXui:
3627     return AArch64::LDRXroW;
3628   case AArch64::STRXroX:
3629   case AArch64::STURXi:
3630   case AArch64::STRXui:
3631     return AArch64::STRXroW;
3632   case AArch64::LDRWroX:
3633   case AArch64::LDURWi:
3634   case AArch64::LDRWui:
3635     return AArch64::LDRWroW;
3636   case AArch64::LDRSWroX:
3637   case AArch64::LDURSWi:
3638   case AArch64::LDRSWui:
3639     return AArch64::LDRSWroW;
3640   case AArch64::STRWroX:
3641   case AArch64::STURWi:
3642   case AArch64::STRWui:
3643     return AArch64::STRWroW;
3644   case AArch64::LDRHroX:
3645   case AArch64::LDURHi:
3646   case AArch64::LDRHui:
3647     return AArch64::LDRHroW;
3648   case AArch64::STRHroX:
3649   case AArch64::STURHi:
3650   case AArch64::STRHui:
3651     return AArch64::STRHroW;
3652   case AArch64::LDRHHroX:
3653   case AArch64::LDURHHi:
3654   case AArch64::LDRHHui:
3655     return AArch64::LDRHHroW;
3656   case AArch64::STRHHroX:
3657   case AArch64::STURHHi:
3658   case AArch64::STRHHui:
3659     return AArch64::STRHHroW;
3660   case AArch64::LDRSHXroX:
3661   case AArch64::LDURSHXi:
3662   case AArch64::LDRSHXui:
3663     return AArch64::LDRSHXroW;
3664   case AArch64::LDRSHWroX:
3665   case AArch64::LDURSHWi:
3666   case AArch64::LDRSHWui:
3667     return AArch64::LDRSHWroW;
3668   case AArch64::LDRBroX:
3669   case AArch64::LDURBi:
3670   case AArch64::LDRBui:
3671     return AArch64::LDRBroW;
3672   case AArch64::LDRBBroX:
3673   case AArch64::LDURBBi:
3674   case AArch64::LDRBBui:
3675     return AArch64::LDRBBroW;
3676   case AArch64::LDRSBXroX:
3677   case AArch64::LDURSBXi:
3678   case AArch64::LDRSBXui:
3679     return AArch64::LDRSBXroW;
3680   case AArch64::LDRSBWroX:
3681   case AArch64::LDURSBWi:
3682   case AArch64::LDRSBWui:
3683     return AArch64::LDRSBWroW;
3684   case AArch64::STRBroX:
3685   case AArch64::STURBi:
3686   case AArch64::STRBui:
3687     return AArch64::STRBroW;
3688   case AArch64::STRBBroX:
3689   case AArch64::STURBBi:
3690   case AArch64::STRBBui:
3691     return AArch64::STRBBroW;
3692   }
3693 }
3694 
3695 MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI,
3696                                                  const ExtAddrMode &AM) const {
3697 
3698   const DebugLoc &DL = MemI.getDebugLoc();
3699   MachineBasicBlock &MBB = *MemI.getParent();
3700   MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();
3701 
3702   if (AM.Form == ExtAddrMode::Formula::Basic) {
3703     if (AM.ScaledReg) {
3704       // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
3705       unsigned Opcode = regOffsetOpcode(MemI.getOpcode());
3706       MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
3707       auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3708                    .addReg(MemI.getOperand(0).getReg(),
3709                            MemI.mayLoad() ? RegState::Define : 0)
3710                    .addReg(AM.BaseReg)
3711                    .addReg(AM.ScaledReg)
3712                    .addImm(0)
3713                    .addImm(AM.Scale > 1)
3714                    .setMemRefs(MemI.memoperands())
3715                    .setMIFlags(MemI.getFlags());
3716       return B.getInstr();
3717     }
3718 
3719     assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
3720            "Addressing mode not supported for folding");
3721 
3722     // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
3723     unsigned Scale = 1;
3724     unsigned Opcode = MemI.getOpcode();
3725     if (isInt<9>(AM.Displacement))
3726       Opcode = unscaledOffsetOpcode(Opcode);
3727     else
3728       Opcode = scaledOffsetOpcode(Opcode, Scale);
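    // E.g. a displacement of -8 fits the signed 9-bit unscaled (LDUR*) form,
    // while 4096 with an 8-byte access uses the scaled (LDR*ui) form with
    // immediate 4096 / 8 = 512.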
3729 
3730     auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3731                  .addReg(MemI.getOperand(0).getReg(),
3732                          MemI.mayLoad() ? RegState::Define : 0)
3733                  .addReg(AM.BaseReg)
3734                  .addImm(AM.Displacement / Scale)
3735                  .setMemRefs(MemI.memoperands())
3736                  .setMIFlags(MemI.getFlags());
3737     return B.getInstr();
3738   }
3739 
3740   if (AM.Form == ExtAddrMode::Formula::SExtScaledReg ||
3741       AM.Form == ExtAddrMode::Formula::ZExtScaledReg) {
3742     // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
3743     assert(AM.ScaledReg && !AM.Displacement &&
3744            "Address offset can be a register or an immediate, but not both");
3745     unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());
3746     MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
3747     // Make sure the offset register is in the correct register class.
3748     Register OffsetReg = AM.ScaledReg;
3749     const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg);
3750     if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {
3751       OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3752       BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)
3753           .addReg(AM.ScaledReg, 0, AArch64::sub_32);
3754     }
3755     auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3756                  .addReg(MemI.getOperand(0).getReg(),
3757                          MemI.mayLoad() ? RegState::Define : 0)
3758                  .addReg(AM.BaseReg)
3759                  .addReg(OffsetReg)
3760                  .addImm(AM.Form == ExtAddrMode::Formula::SExtScaledReg)
3761                  .addImm(AM.Scale != 1)
3762                  .setMemRefs(MemI.memoperands())
3763                  .setMIFlags(MemI.getFlags());
3764 
3765     return B.getInstr();
3766   }
3767 
3768   llvm_unreachable(
3769       "Function must not be called with an addressing mode it can't handle");
3770 }
3771 
3772 /// Return true if the opcode is a post-index ld/st instruction, which really
3773 /// accesses memory at base+0.
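/// For example, `ldr x0, [x1], #8` reads from [x1] and only afterwards
/// increments x1, so its memory offset relative to the base is 0.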
3774 static bool isPostIndexLdStOpcode(unsigned Opcode) {
3775   switch (Opcode) {
3776   default:
3777     return false;
3778   case AArch64::LD1Fourv16b_POST:
3779   case AArch64::LD1Fourv1d_POST:
3780   case AArch64::LD1Fourv2d_POST:
3781   case AArch64::LD1Fourv2s_POST:
3782   case AArch64::LD1Fourv4h_POST:
3783   case AArch64::LD1Fourv4s_POST:
3784   case AArch64::LD1Fourv8b_POST:
3785   case AArch64::LD1Fourv8h_POST:
3786   case AArch64::LD1Onev16b_POST:
3787   case AArch64::LD1Onev1d_POST:
3788   case AArch64::LD1Onev2d_POST:
3789   case AArch64::LD1Onev2s_POST:
3790   case AArch64::LD1Onev4h_POST:
3791   case AArch64::LD1Onev4s_POST:
3792   case AArch64::LD1Onev8b_POST:
3793   case AArch64::LD1Onev8h_POST:
3794   case AArch64::LD1Rv16b_POST:
3795   case AArch64::LD1Rv1d_POST:
3796   case AArch64::LD1Rv2d_POST:
3797   case AArch64::LD1Rv2s_POST:
3798   case AArch64::LD1Rv4h_POST:
3799   case AArch64::LD1Rv4s_POST:
3800   case AArch64::LD1Rv8b_POST:
3801   case AArch64::LD1Rv8h_POST:
3802   case AArch64::LD1Threev16b_POST:
3803   case AArch64::LD1Threev1d_POST:
3804   case AArch64::LD1Threev2d_POST:
3805   case AArch64::LD1Threev2s_POST:
3806   case AArch64::LD1Threev4h_POST:
3807   case AArch64::LD1Threev4s_POST:
3808   case AArch64::LD1Threev8b_POST:
3809   case AArch64::LD1Threev8h_POST:
3810   case AArch64::LD1Twov16b_POST:
3811   case AArch64::LD1Twov1d_POST:
3812   case AArch64::LD1Twov2d_POST:
3813   case AArch64::LD1Twov2s_POST:
3814   case AArch64::LD1Twov4h_POST:
3815   case AArch64::LD1Twov4s_POST:
3816   case AArch64::LD1Twov8b_POST:
3817   case AArch64::LD1Twov8h_POST:
3818   case AArch64::LD1i16_POST:
3819   case AArch64::LD1i32_POST:
3820   case AArch64::LD1i64_POST:
3821   case AArch64::LD1i8_POST:
3822   case AArch64::LD2Rv16b_POST:
3823   case AArch64::LD2Rv1d_POST:
3824   case AArch64::LD2Rv2d_POST:
3825   case AArch64::LD2Rv2s_POST:
3826   case AArch64::LD2Rv4h_POST:
3827   case AArch64::LD2Rv4s_POST:
3828   case AArch64::LD2Rv8b_POST:
3829   case AArch64::LD2Rv8h_POST:
3830   case AArch64::LD2Twov16b_POST:
3831   case AArch64::LD2Twov2d_POST:
3832   case AArch64::LD2Twov2s_POST:
3833   case AArch64::LD2Twov4h_POST:
3834   case AArch64::LD2Twov4s_POST:
3835   case AArch64::LD2Twov8b_POST:
3836   case AArch64::LD2Twov8h_POST:
3837   case AArch64::LD2i16_POST:
3838   case AArch64::LD2i32_POST:
3839   case AArch64::LD2i64_POST:
3840   case AArch64::LD2i8_POST:
3841   case AArch64::LD3Rv16b_POST:
3842   case AArch64::LD3Rv1d_POST:
3843   case AArch64::LD3Rv2d_POST:
3844   case AArch64::LD3Rv2s_POST:
3845   case AArch64::LD3Rv4h_POST:
3846   case AArch64::LD3Rv4s_POST:
3847   case AArch64::LD3Rv8b_POST:
3848   case AArch64::LD3Rv8h_POST:
3849   case AArch64::LD3Threev16b_POST:
3850   case AArch64::LD3Threev2d_POST:
3851   case AArch64::LD3Threev2s_POST:
3852   case AArch64::LD3Threev4h_POST:
3853   case AArch64::LD3Threev4s_POST:
3854   case AArch64::LD3Threev8b_POST:
3855   case AArch64::LD3Threev8h_POST:
3856   case AArch64::LD3i16_POST:
3857   case AArch64::LD3i32_POST:
3858   case AArch64::LD3i64_POST:
3859   case AArch64::LD3i8_POST:
3860   case AArch64::LD4Fourv16b_POST:
3861   case AArch64::LD4Fourv2d_POST:
3862   case AArch64::LD4Fourv2s_POST:
3863   case AArch64::LD4Fourv4h_POST:
3864   case AArch64::LD4Fourv4s_POST:
3865   case AArch64::LD4Fourv8b_POST:
3866   case AArch64::LD4Fourv8h_POST:
3867   case AArch64::LD4Rv16b_POST:
3868   case AArch64::LD4Rv1d_POST:
3869   case AArch64::LD4Rv2d_POST:
3870   case AArch64::LD4Rv2s_POST:
3871   case AArch64::LD4Rv4h_POST:
3872   case AArch64::LD4Rv4s_POST:
3873   case AArch64::LD4Rv8b_POST:
3874   case AArch64::LD4Rv8h_POST:
3875   case AArch64::LD4i16_POST:
3876   case AArch64::LD4i32_POST:
3877   case AArch64::LD4i64_POST:
3878   case AArch64::LD4i8_POST:
3879   case AArch64::LDAPRWpost:
3880   case AArch64::LDAPRXpost:
3881   case AArch64::LDIAPPWpost:
3882   case AArch64::LDIAPPXpost:
3883   case AArch64::LDPDpost:
3884   case AArch64::LDPQpost:
3885   case AArch64::LDPSWpost:
3886   case AArch64::LDPSpost:
3887   case AArch64::LDPWpost:
3888   case AArch64::LDPXpost:
3889   case AArch64::LDRBBpost:
3890   case AArch64::LDRBpost:
3891   case AArch64::LDRDpost:
3892   case AArch64::LDRHHpost:
3893   case AArch64::LDRHpost:
3894   case AArch64::LDRQpost:
3895   case AArch64::LDRSBWpost:
3896   case AArch64::LDRSBXpost:
3897   case AArch64::LDRSHWpost:
3898   case AArch64::LDRSHXpost:
3899   case AArch64::LDRSWpost:
3900   case AArch64::LDRSpost:
3901   case AArch64::LDRWpost:
3902   case AArch64::LDRXpost:
3903   case AArch64::ST1Fourv16b_POST:
3904   case AArch64::ST1Fourv1d_POST:
3905   case AArch64::ST1Fourv2d_POST:
3906   case AArch64::ST1Fourv2s_POST:
3907   case AArch64::ST1Fourv4h_POST:
3908   case AArch64::ST1Fourv4s_POST:
3909   case AArch64::ST1Fourv8b_POST:
3910   case AArch64::ST1Fourv8h_POST:
3911   case AArch64::ST1Onev16b_POST:
3912   case AArch64::ST1Onev1d_POST:
3913   case AArch64::ST1Onev2d_POST:
3914   case AArch64::ST1Onev2s_POST:
3915   case AArch64::ST1Onev4h_POST:
3916   case AArch64::ST1Onev4s_POST:
3917   case AArch64::ST1Onev8b_POST:
3918   case AArch64::ST1Onev8h_POST:
3919   case AArch64::ST1Threev16b_POST:
3920   case AArch64::ST1Threev1d_POST:
3921   case AArch64::ST1Threev2d_POST:
3922   case AArch64::ST1Threev2s_POST:
3923   case AArch64::ST1Threev4h_POST:
3924   case AArch64::ST1Threev4s_POST:
3925   case AArch64::ST1Threev8b_POST:
3926   case AArch64::ST1Threev8h_POST:
3927   case AArch64::ST1Twov16b_POST:
3928   case AArch64::ST1Twov1d_POST:
3929   case AArch64::ST1Twov2d_POST:
3930   case AArch64::ST1Twov2s_POST:
3931   case AArch64::ST1Twov4h_POST:
3932   case AArch64::ST1Twov4s_POST:
3933   case AArch64::ST1Twov8b_POST:
3934   case AArch64::ST1Twov8h_POST:
3935   case AArch64::ST1i16_POST:
3936   case AArch64::ST1i32_POST:
3937   case AArch64::ST1i64_POST:
3938   case AArch64::ST1i8_POST:
3939   case AArch64::ST2GPostIndex:
3940   case AArch64::ST2Twov16b_POST:
3941   case AArch64::ST2Twov2d_POST:
3942   case AArch64::ST2Twov2s_POST:
3943   case AArch64::ST2Twov4h_POST:
3944   case AArch64::ST2Twov4s_POST:
3945   case AArch64::ST2Twov8b_POST:
3946   case AArch64::ST2Twov8h_POST:
3947   case AArch64::ST2i16_POST:
3948   case AArch64::ST2i32_POST:
3949   case AArch64::ST2i64_POST:
3950   case AArch64::ST2i8_POST:
3951   case AArch64::ST3Threev16b_POST:
3952   case AArch64::ST3Threev2d_POST:
3953   case AArch64::ST3Threev2s_POST:
3954   case AArch64::ST3Threev4h_POST:
3955   case AArch64::ST3Threev4s_POST:
3956   case AArch64::ST3Threev8b_POST:
3957   case AArch64::ST3Threev8h_POST:
3958   case AArch64::ST3i16_POST:
3959   case AArch64::ST3i32_POST:
3960   case AArch64::ST3i64_POST:
3961   case AArch64::ST3i8_POST:
3962   case AArch64::ST4Fourv16b_POST:
3963   case AArch64::ST4Fourv2d_POST:
3964   case AArch64::ST4Fourv2s_POST:
3965   case AArch64::ST4Fourv4h_POST:
3966   case AArch64::ST4Fourv4s_POST:
3967   case AArch64::ST4Fourv8b_POST:
3968   case AArch64::ST4Fourv8h_POST:
3969   case AArch64::ST4i16_POST:
3970   case AArch64::ST4i32_POST:
3971   case AArch64::ST4i64_POST:
3972   case AArch64::ST4i8_POST:
3973   case AArch64::STGPostIndex:
3974   case AArch64::STGPpost:
3975   case AArch64::STPDpost:
3976   case AArch64::STPQpost:
3977   case AArch64::STPSpost:
3978   case AArch64::STPWpost:
3979   case AArch64::STPXpost:
3980   case AArch64::STRBBpost:
3981   case AArch64::STRBpost:
3982   case AArch64::STRDpost:
3983   case AArch64::STRHHpost:
3984   case AArch64::STRHpost:
3985   case AArch64::STRQpost:
3986   case AArch64::STRSpost:
3987   case AArch64::STRWpost:
3988   case AArch64::STRXpost:
3989   case AArch64::STZ2GPostIndex:
3990   case AArch64::STZGPostIndex:
3991     return true;
3992   }
3993 }
3994 
3995 bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
3996     const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
3997     bool &OffsetIsScalable, TypeSize &Width,
3998     const TargetRegisterInfo *TRI) const {
3999   assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4000   // Handle only loads/stores with base register followed by immediate offset.
4001   if (LdSt.getNumExplicitOperands() == 3) {
4002     // Non-paired instruction (e.g., ldr x1, [x0, #8]).
4003     if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
4004         !LdSt.getOperand(2).isImm())
4005       return false;
4006   } else if (LdSt.getNumExplicitOperands() == 4) {
4007     // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
4008     if (!LdSt.getOperand(1).isReg() ||
4009         (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
4010         !LdSt.getOperand(3).isImm())
4011       return false;
4012   } else
4013     return false;
4014 
4015   // Get the scaling factor for the instruction and set the width of the
4016   // memory access.
4017   TypeSize Scale(0U, false);
4018   int64_t Dummy1, Dummy2;
4019 
4020   // If this returns false, then it's an instruction we don't want to handle.
4021   if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
4022     return false;
4023 
4024   // Compute the offset. The offset is the immediate operand multiplied by the
4025   // scaling factor. Unscaled instructions have a scaling factor of 1.
4026   // Post-indexed instructions are a special case and have an offset of 0.
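  // E.g. `ldp x1, x2, [x0, #16]` has immediate operand 2 and scale 8,
  // giving Offset = 16.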
4027   if (isPostIndexLdStOpcode(LdSt.getOpcode())) {
4028     BaseOp = &LdSt.getOperand(2);
4029     Offset = 0;
4030   } else if (LdSt.getNumExplicitOperands() == 3) {
4031     BaseOp = &LdSt.getOperand(1);
4032     Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue();
4033   } else {
4034     assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
4035     BaseOp = &LdSt.getOperand(2);
4036     Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue();
4037   }
4038   OffsetIsScalable = Scale.isScalable();
4039 
4040   return BaseOp->isReg() || BaseOp->isFI();
4041 }
4042 
4043 MachineOperand &
4044 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
4045   assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4046   MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
4047   assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
4048   return OfsOp;
4049 }
4050 
4051 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
4052                                     TypeSize &Width, int64_t &MinOffset,
4053                                     int64_t &MaxOffset) {
4054   switch (Opcode) {
4055   // Not a memory operation or something we want to handle.
4056   default:
4057     Scale = TypeSize::getFixed(0);
4058     Width = TypeSize::getFixed(0);
4059     MinOffset = MaxOffset = 0;
4060     return false;
4061   // LDR / STR
4062   case AArch64::LDRQui:
4063   case AArch64::STRQui:
4064     Scale = TypeSize::getFixed(16);
4065     Width = TypeSize::getFixed(16);
4066     MinOffset = 0;
4067     MaxOffset = 4095;
4068     break;
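  // The *ui forms encode an unsigned 12-bit immediate in units of Scale, so
  // e.g. LDRQui can reach byte offsets [0, 4095 * 16].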
4069   case AArch64::LDRXui:
4070   case AArch64::LDRDui:
4071   case AArch64::STRXui:
4072   case AArch64::STRDui:
4073   case AArch64::PRFMui:
4074     Scale = TypeSize::getFixed(8);
4075     Width = TypeSize::getFixed(8);
4076     MinOffset = 0;
4077     MaxOffset = 4095;
4078     break;
4079   case AArch64::LDRWui:
4080   case AArch64::LDRSui:
4081   case AArch64::LDRSWui:
4082   case AArch64::STRWui:
4083   case AArch64::STRSui:
4084     Scale = TypeSize::getFixed(4);
4085     Width = TypeSize::getFixed(4);
4086     MinOffset = 0;
4087     MaxOffset = 4095;
4088     break;
4089   case AArch64::LDRHui:
4090   case AArch64::LDRHHui:
4091   case AArch64::LDRSHWui:
4092   case AArch64::LDRSHXui:
4093   case AArch64::STRHui:
4094   case AArch64::STRHHui:
4095     Scale = TypeSize::getFixed(2);
4096     Width = TypeSize::getFixed(2);
4097     MinOffset = 0;
4098     MaxOffset = 4095;
4099     break;
4100   case AArch64::LDRBui:
4101   case AArch64::LDRBBui:
4102   case AArch64::LDRSBWui:
4103   case AArch64::LDRSBXui:
4104   case AArch64::STRBui:
4105   case AArch64::STRBBui:
4106     Scale = TypeSize::getFixed(1);
4107     Width = TypeSize::getFixed(1);
4108     MinOffset = 0;
4109     MaxOffset = 4095;
4110     break;
4111   // post/pre inc
4112   case AArch64::STRQpre:
4113   case AArch64::LDRQpost:
4114     Scale = TypeSize::getFixed(1);
4115     Width = TypeSize::getFixed(16);
4116     MinOffset = -256;
4117     MaxOffset = 255;
4118     break;
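  // Pre/post-indexed forms encode a signed 9-bit byte offset, hence
  // [-256, 255] regardless of access size.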
4119   case AArch64::LDRDpost:
4120   case AArch64::LDRDpre:
4121   case AArch64::LDRXpost:
4122   case AArch64::LDRXpre:
4123   case AArch64::STRDpost:
4124   case AArch64::STRDpre:
4125   case AArch64::STRXpost:
4126   case AArch64::STRXpre:
4127     Scale = TypeSize::getFixed(1);
4128     Width = TypeSize::getFixed(8);
4129     MinOffset = -256;
4130     MaxOffset = 255;
4131     break;
4132   case AArch64::STRWpost:
4133   case AArch64::STRWpre:
4134   case AArch64::LDRWpost:
4135   case AArch64::LDRWpre:
4136   case AArch64::STRSpost:
4137   case AArch64::STRSpre:
4138   case AArch64::LDRSpost:
4139   case AArch64::LDRSpre:
4140     Scale = TypeSize::getFixed(1);
4141     Width = TypeSize::getFixed(4);
4142     MinOffset = -256;
4143     MaxOffset = 255;
4144     break;
4145   case AArch64::LDRHpost:
4146   case AArch64::LDRHpre:
4147   case AArch64::STRHpost:
4148   case AArch64::STRHpre:
4149   case AArch64::LDRHHpost:
4150   case AArch64::LDRHHpre:
4151   case AArch64::STRHHpost:
4152   case AArch64::STRHHpre:
4153     Scale = TypeSize::getFixed(1);
4154     Width = TypeSize::getFixed(2);
4155     MinOffset = -256;
4156     MaxOffset = 255;
4157     break;
4158   case AArch64::LDRBpost:
4159   case AArch64::LDRBpre:
4160   case AArch64::STRBpost:
4161   case AArch64::STRBpre:
4162   case AArch64::LDRBBpost:
4163   case AArch64::LDRBBpre:
4164   case AArch64::STRBBpost:
4165   case AArch64::STRBBpre:
4166     Scale = TypeSize::getFixed(1);
4167     Width = TypeSize::getFixed(1);
4168     MinOffset = -256;
4169     MaxOffset = 255;
4170     break;
4171   // Unscaled
4172   case AArch64::LDURQi:
4173   case AArch64::STURQi:
4174     Scale = TypeSize::getFixed(1);
4175     Width = TypeSize::getFixed(16);
4176     MinOffset = -256;
4177     MaxOffset = 255;
4178     break;
4179   case AArch64::LDURXi:
4180   case AArch64::LDURDi:
4181   case AArch64::LDAPURXi:
4182   case AArch64::STURXi:
4183   case AArch64::STURDi:
4184   case AArch64::STLURXi:
4185   case AArch64::PRFUMi:
4186     Scale = TypeSize::getFixed(1);
4187     Width = TypeSize::getFixed(8);
4188     MinOffset = -256;
4189     MaxOffset = 255;
4190     break;
4191   case AArch64::LDURWi:
4192   case AArch64::LDURSi:
4193   case AArch64::LDURSWi:
4194   case AArch64::LDAPURi:
4195   case AArch64::LDAPURSWi:
4196   case AArch64::STURWi:
4197   case AArch64::STURSi:
4198   case AArch64::STLURWi:
4199     Scale = TypeSize::getFixed(1);
4200     Width = TypeSize::getFixed(4);
4201     MinOffset = -256;
4202     MaxOffset = 255;
4203     break;
4204   case AArch64::LDURHi:
4205   case AArch64::LDURHHi:
4206   case AArch64::LDURSHXi:
4207   case AArch64::LDURSHWi:
4208   case AArch64::LDAPURHi:
4209   case AArch64::LDAPURSHWi:
4210   case AArch64::LDAPURSHXi:
4211   case AArch64::STURHi:
4212   case AArch64::STURHHi:
4213   case AArch64::STLURHi:
4214     Scale = TypeSize::getFixed(1);
4215     Width = TypeSize::getFixed(2);
4216     MinOffset = -256;
4217     MaxOffset = 255;
4218     break;
4219   case AArch64::LDURBi:
4220   case AArch64::LDURBBi:
4221   case AArch64::LDURSBXi:
4222   case AArch64::LDURSBWi:
4223   case AArch64::LDAPURBi:
4224   case AArch64::LDAPURSBWi:
4225   case AArch64::LDAPURSBXi:
4226   case AArch64::STURBi:
4227   case AArch64::STURBBi:
4228   case AArch64::STLURBi:
4229     Scale = TypeSize::getFixed(1);
4230     Width = TypeSize::getFixed(1);
4231     MinOffset = -256;
4232     MaxOffset = 255;
4233     break;
4234   // LDP / STP (including pre/post inc)
4235   case AArch64::LDPQi:
4236   case AArch64::LDNPQi:
4237   case AArch64::STPQi:
4238   case AArch64::STNPQi:
4239   case AArch64::LDPQpost:
4240   case AArch64::LDPQpre:
4241   case AArch64::STPQpost:
4242   case AArch64::STPQpre:
4243     Scale = TypeSize::getFixed(16);
4244     Width = TypeSize::getFixed(16 * 2);
4245     MinOffset = -64;
4246     MaxOffset = 63;
4247     break;
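  // Paired forms encode a signed 7-bit immediate in units of the access size,
  // hence [-64, 63]; for Q pairs that spans byte offsets [-1024, 1008].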
4248   case AArch64::LDPXi:
4249   case AArch64::LDPDi:
4250   case AArch64::LDNPXi:
4251   case AArch64::LDNPDi:
4252   case AArch64::STPXi:
4253   case AArch64::STPDi:
4254   case AArch64::STNPXi:
4255   case AArch64::STNPDi:
4256   case AArch64::LDPDpost:
4257   case AArch64::LDPDpre:
4258   case AArch64::LDPXpost:
4259   case AArch64::LDPXpre:
4260   case AArch64::STPDpost:
4261   case AArch64::STPDpre:
4262   case AArch64::STPXpost:
4263   case AArch64::STPXpre:
4264     Scale = TypeSize::getFixed(8);
4265     Width = TypeSize::getFixed(8 * 2);
4266     MinOffset = -64;
4267     MaxOffset = 63;
4268     break;
4269   case AArch64::LDPWi:
4270   case AArch64::LDPSi:
4271   case AArch64::LDNPWi:
4272   case AArch64::LDNPSi:
4273   case AArch64::STPWi:
4274   case AArch64::STPSi:
4275   case AArch64::STNPWi:
4276   case AArch64::STNPSi:
4277   case AArch64::LDPSpost:
4278   case AArch64::LDPSpre:
4279   case AArch64::LDPWpost:
4280   case AArch64::LDPWpre:
4281   case AArch64::STPSpost:
4282   case AArch64::STPSpre:
4283   case AArch64::STPWpost:
4284   case AArch64::STPWpre:
4285     Scale = TypeSize::getFixed(4);
4286     Width = TypeSize::getFixed(4 * 2);
4287     MinOffset = -64;
4288     MaxOffset = 63;
4289     break;
4290   case AArch64::StoreSwiftAsyncContext:
4291     // Store is an STRXui, but there might be an ADDXri in the expansion too.
4292     Scale = TypeSize::getFixed(1);
4293     Width = TypeSize::getFixed(8);
4294     MinOffset = 0;
4295     MaxOffset = 4095;
4296     break;
4297   case AArch64::ADDG:
4298     Scale = TypeSize::getFixed(16);
4299     Width = TypeSize::getFixed(0);
4300     MinOffset = 0;
4301     MaxOffset = 63;
4302     break;
4303   case AArch64::TAGPstack:
4304     Scale = TypeSize::getFixed(16);
4305     Width = TypeSize::getFixed(0);
4306     // TAGP with a negative offset turns into SUBP, which has a maximum offset
4307     // of 63 (not 64!).
4308     MinOffset = -63;
4309     MaxOffset = 63;
4310     break;
4311   case AArch64::LDG:
4312   case AArch64::STGi:
4313   case AArch64::STGPreIndex:
4314   case AArch64::STGPostIndex:
4315   case AArch64::STZGi:
4316   case AArch64::STZGPreIndex:
4317   case AArch64::STZGPostIndex:
4318     Scale = TypeSize::getFixed(16);
4319     Width = TypeSize::getFixed(16);
4320     MinOffset = -256;
4321     MaxOffset = 255;
4322     break;
4323   // SVE
4324   case AArch64::STR_ZZZZXI:
4325   case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
4326   case AArch64::LDR_ZZZZXI:
4327   case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
4328     Scale = TypeSize::getScalable(16);
4329     Width = TypeSize::getScalable(16 * 4);
4330     MinOffset = -256;
4331     MaxOffset = 252;
4332     break;
4333   case AArch64::STR_ZZZXI:
4334   case AArch64::LDR_ZZZXI:
4335     Scale = TypeSize::getScalable(16);
4336     Width = TypeSize::getScalable(16 * 3);
4337     MinOffset = -256;
4338     MaxOffset = 253;
4339     break;
4340   case AArch64::STR_ZZXI:
4341   case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
4342   case AArch64::LDR_ZZXI:
4343   case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
4344     Scale = TypeSize::getScalable(16);
4345     Width = TypeSize::getScalable(16 * 2);
4346     MinOffset = -256;
4347     MaxOffset = 254;
4348     break;
4349   case AArch64::LDR_PXI:
4350   case AArch64::STR_PXI:
4351     Scale = TypeSize::getScalable(2);
4352     Width = TypeSize::getScalable(2);
4353     MinOffset = -256;
4354     MaxOffset = 255;
4355     break;
4356   case AArch64::LDR_PPXI:
4357   case AArch64::STR_PPXI:
4358     Scale = TypeSize::getScalable(2);
4359     Width = TypeSize::getScalable(2 * 2);
4360     MinOffset = -256;
4361     MaxOffset = 254;
4362     break;
4363   case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO:
4364   case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO:
4365   case AArch64::LDR_ZXI:
4366   case AArch64::STR_ZXI:
4367     Scale = TypeSize::getScalable(16);
4368     Width = TypeSize::getScalable(16);
4369     MinOffset = -256;
4370     MaxOffset = 255;
4371     break;
4372   case AArch64::LD1B_IMM:
4373   case AArch64::LD1H_IMM:
4374   case AArch64::LD1W_IMM:
4375   case AArch64::LD1D_IMM:
4376   case AArch64::LDNT1B_ZRI:
4377   case AArch64::LDNT1H_ZRI:
4378   case AArch64::LDNT1W_ZRI:
4379   case AArch64::LDNT1D_ZRI:
4380   case AArch64::ST1B_IMM:
4381   case AArch64::ST1H_IMM:
4382   case AArch64::ST1W_IMM:
4383   case AArch64::ST1D_IMM:
4384   case AArch64::STNT1B_ZRI:
4385   case AArch64::STNT1H_ZRI:
4386   case AArch64::STNT1W_ZRI:
4387   case AArch64::STNT1D_ZRI:
4388   case AArch64::LDNF1B_IMM:
4389   case AArch64::LDNF1H_IMM:
4390   case AArch64::LDNF1W_IMM:
4391   case AArch64::LDNF1D_IMM:
4392     // A full vector's worth of data
4393     // Width = mbytes * elements
4394     Scale = TypeSize::getScalable(16);
4395     Width = TypeSize::getScalable(16);
4396     MinOffset = -8;
4397     MaxOffset = 7;
4398     break;
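  // SVE contiguous forms encode a signed 4-bit immediate in units of whole
  // vector registers, hence [-8, 7].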
4399   case AArch64::LD2B_IMM:
4400   case AArch64::LD2H_IMM:
4401   case AArch64::LD2W_IMM:
4402   case AArch64::LD2D_IMM:
4403   case AArch64::ST2B_IMM:
4404   case AArch64::ST2H_IMM:
4405   case AArch64::ST2W_IMM:
4406   case AArch64::ST2D_IMM:
4407     Scale = TypeSize::getScalable(32);
4408     Width = TypeSize::getScalable(16 * 2);
4409     MinOffset = -8;
4410     MaxOffset = 7;
4411     break;
4412   case AArch64::LD3B_IMM:
4413   case AArch64::LD3H_IMM:
4414   case AArch64::LD3W_IMM:
4415   case AArch64::LD3D_IMM:
4416   case AArch64::ST3B_IMM:
4417   case AArch64::ST3H_IMM:
4418   case AArch64::ST3W_IMM:
4419   case AArch64::ST3D_IMM:
4420     Scale = TypeSize::getScalable(48);
4421     Width = TypeSize::getScalable(16 * 3);
4422     MinOffset = -8;
4423     MaxOffset = 7;
4424     break;
4425   case AArch64::LD4B_IMM:
4426   case AArch64::LD4H_IMM:
4427   case AArch64::LD4W_IMM:
4428   case AArch64::LD4D_IMM:
4429   case AArch64::ST4B_IMM:
4430   case AArch64::ST4H_IMM:
4431   case AArch64::ST4W_IMM:
4432   case AArch64::ST4D_IMM:
4433     Scale = TypeSize::getScalable(64);
4434     Width = TypeSize::getScalable(16 * 4);
4435     MinOffset = -8;
4436     MaxOffset = 7;
4437     break;
4438   case AArch64::LD1B_H_IMM:
4439   case AArch64::LD1SB_H_IMM:
4440   case AArch64::LD1H_S_IMM:
4441   case AArch64::LD1SH_S_IMM:
4442   case AArch64::LD1W_D_IMM:
4443   case AArch64::LD1SW_D_IMM:
4444   case AArch64::ST1B_H_IMM:
4445   case AArch64::ST1H_S_IMM:
4446   case AArch64::ST1W_D_IMM:
4447   case AArch64::LDNF1B_H_IMM:
4448   case AArch64::LDNF1SB_H_IMM:
4449   case AArch64::LDNF1H_S_IMM:
4450   case AArch64::LDNF1SH_S_IMM:
4451   case AArch64::LDNF1W_D_IMM:
4452   case AArch64::LDNF1SW_D_IMM:
4453     // A half vector's worth of data
4454     // Width = mbytes * elements
4455     Scale = TypeSize::getScalable(8);
4456     Width = TypeSize::getScalable(8);
4457     MinOffset = -8;
4458     MaxOffset = 7;
4459     break;
4460   case AArch64::LD1B_S_IMM:
4461   case AArch64::LD1SB_S_IMM:
4462   case AArch64::LD1H_D_IMM:
4463   case AArch64::LD1SH_D_IMM:
4464   case AArch64::ST1B_S_IMM:
4465   case AArch64::ST1H_D_IMM:
4466   case AArch64::LDNF1B_S_IMM:
4467   case AArch64::LDNF1SB_S_IMM:
4468   case AArch64::LDNF1H_D_IMM:
4469   case AArch64::LDNF1SH_D_IMM:
4470     // A quarter vector's worth of data
4471     // Width = mbytes * elements
4472     Scale = TypeSize::getScalable(4);
4473     Width = TypeSize::getScalable(4);
4474     MinOffset = -8;
4475     MaxOffset = 7;
4476     break;
4477   case AArch64::LD1B_D_IMM:
4478   case AArch64::LD1SB_D_IMM:
4479   case AArch64::ST1B_D_IMM:
4480   case AArch64::LDNF1B_D_IMM:
4481   case AArch64::LDNF1SB_D_IMM:
4482     // An eighth of a vector's worth of data
4483     // Width = mbytes * elements
4484     Scale = TypeSize::getScalable(2);
4485     Width = TypeSize::getScalable(2);
4486     MinOffset = -8;
4487     MaxOffset = 7;
4488     break;
4489   case AArch64::ST2Gi:
4490   case AArch64::ST2GPreIndex:
4491   case AArch64::ST2GPostIndex:
4492   case AArch64::STZ2Gi:
4493   case AArch64::STZ2GPreIndex:
4494   case AArch64::STZ2GPostIndex:
4495     Scale = TypeSize::getFixed(16);
4496     Width = TypeSize::getFixed(32);
4497     MinOffset = -256;
4498     MaxOffset = 255;
4499     break;
4500   case AArch64::STGPi:
4501   case AArch64::STGPpost:
4502   case AArch64::STGPpre:
4503     Scale = TypeSize::getFixed(16);
4504     Width = TypeSize::getFixed(16);
4505     MinOffset = -64;
4506     MaxOffset = 63;
4507     break;
4508   case AArch64::LD1RB_IMM:
4509   case AArch64::LD1RB_H_IMM:
4510   case AArch64::LD1RB_S_IMM:
4511   case AArch64::LD1RB_D_IMM:
4512   case AArch64::LD1RSB_H_IMM:
4513   case AArch64::LD1RSB_S_IMM:
4514   case AArch64::LD1RSB_D_IMM:
4515     Scale = TypeSize::getFixed(1);
4516     Width = TypeSize::getFixed(1);
4517     MinOffset = 0;
4518     MaxOffset = 63;
4519     break;
4520   case AArch64::LD1RH_IMM:
4521   case AArch64::LD1RH_S_IMM:
4522   case AArch64::LD1RH_D_IMM:
4523   case AArch64::LD1RSH_S_IMM:
4524   case AArch64::LD1RSH_D_IMM:
4525     Scale = TypeSize::getFixed(2);
4526     Width = TypeSize::getFixed(2);
4527     MinOffset = 0;
4528     MaxOffset = 63;
4529     break;
4530   case AArch64::LD1RW_IMM:
4531   case AArch64::LD1RW_D_IMM:
4532   case AArch64::LD1RSW_IMM:
4533     Scale = TypeSize::getFixed(4);
4534     Width = TypeSize::getFixed(4);
4535     MinOffset = 0;
4536     MaxOffset = 63;
4537     break;
4538   case AArch64::LD1RD_IMM:
4539     Scale = TypeSize::getFixed(8);
4540     Width = TypeSize::getFixed(8);
4541     MinOffset = 0;
4542     MaxOffset = 63;
4543     break;
4544   }
4545 
4546   return true;
4547 }
4548 
4549 // Scaling factor for unscaled load or store offsets.
4550 int AArch64InstrInfo::getMemScale(unsigned Opc) {
4551   switch (Opc) {
4552   default:
4553     llvm_unreachable("Opcode has unknown scale!");
4554   case AArch64::LDRBBui:
4555   case AArch64::LDURBBi:
4556   case AArch64::LDRSBWui:
4557   case AArch64::LDURSBWi:
4558   case AArch64::STRBBui:
4559   case AArch64::STURBBi:
4560     return 1;
4561   case AArch64::LDRHHui:
4562   case AArch64::LDURHHi:
4563   case AArch64::LDRSHWui:
4564   case AArch64::LDURSHWi:
4565   case AArch64::STRHHui:
4566   case AArch64::STURHHi:
4567     return 2;
4568   case AArch64::LDRSui:
4569   case AArch64::LDURSi:
4570   case AArch64::LDRSpre:
4571   case AArch64::LDRSWui:
4572   case AArch64::LDURSWi:
4573   case AArch64::LDRSWpre:
4574   case AArch64::LDRWpre:
4575   case AArch64::LDRWui:
4576   case AArch64::LDURWi:
4577   case AArch64::STRSui:
4578   case AArch64::STURSi:
4579   case AArch64::STRSpre:
4580   case AArch64::STRWui:
4581   case AArch64::STURWi:
4582   case AArch64::STRWpre:
4583   case AArch64::LDPSi:
4584   case AArch64::LDPSWi:
4585   case AArch64::LDPWi:
4586   case AArch64::STPSi:
4587   case AArch64::STPWi:
4588     return 4;
4589   case AArch64::LDRDui:
4590   case AArch64::LDURDi:
4591   case AArch64::LDRDpre:
4592   case AArch64::LDRXui:
4593   case AArch64::LDURXi:
4594   case AArch64::LDRXpre:
4595   case AArch64::STRDui:
4596   case AArch64::STURDi:
4597   case AArch64::STRDpre:
4598   case AArch64::STRXui:
4599   case AArch64::STURXi:
4600   case AArch64::STRXpre:
4601   case AArch64::LDPDi:
4602   case AArch64::LDPXi:
4603   case AArch64::STPDi:
4604   case AArch64::STPXi:
4605     return 8;
4606   case AArch64::LDRQui:
4607   case AArch64::LDURQi:
4608   case AArch64::STRQui:
4609   case AArch64::STURQi:
4610   case AArch64::STRQpre:
4611   case AArch64::LDPQi:
4612   case AArch64::LDRQpre:
4613   case AArch64::STPQi:
4614   case AArch64::STGi:
4615   case AArch64::STZGi:
4616   case AArch64::ST2Gi:
4617   case AArch64::STZ2Gi:
4618   case AArch64::STGPi:
4619     return 16;
4620   }
4621 }
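// A worked example of the scale, as a minimal standalone sketch (the helper
// below is illustrative and not part of this file): an 8-byte access such as
// "ldr x0, [x1, #24]" (LDRXui) encodes a scaled immediate of 3, since the
// byte offset is the immediate times the scale:
//
//   int64_t byteOffset(unsigned Opc, int64_t ScaledImm) {
//     return ScaledImm * AArch64InstrInfo::getMemScale(Opc);
//   }
//   // byteOffset(AArch64::LDRXui, 3) == 24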
4622 
4623 bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
4624   switch (MI.getOpcode()) {
4625   default:
4626     return false;
4627   case AArch64::LDRWpre:
4628   case AArch64::LDRXpre:
4629   case AArch64::LDRSWpre:
4630   case AArch64::LDRSpre:
4631   case AArch64::LDRDpre:
4632   case AArch64::LDRQpre:
4633     return true;
4634   }
4635 }
4636 
4637 bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
4638   switch (MI.getOpcode()) {
4639   default:
4640     return false;
4641   case AArch64::STRWpre:
4642   case AArch64::STRXpre:
4643   case AArch64::STRSpre:
4644   case AArch64::STRDpre:
4645   case AArch64::STRQpre:
4646     return true;
4647   }
4648 }
4649 
4650 bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
4651   return isPreLd(MI) || isPreSt(MI);
4652 }
4653 
4654 bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
4655   switch (MI.getOpcode()) {
4656   default:
4657     return false;
4658   case AArch64::LDPSi:
4659   case AArch64::LDPSWi:
4660   case AArch64::LDPDi:
4661   case AArch64::LDPQi:
4662   case AArch64::LDPWi:
4663   case AArch64::LDPXi:
4664   case AArch64::STPSi:
4665   case AArch64::STPDi:
4666   case AArch64::STPQi:
4667   case AArch64::STPWi:
4668   case AArch64::STPXi:
4669   case AArch64::STGPi:
4670     return true;
4671   }
4672 }
4673 
4674 const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
4675   assert(MI.mayLoadOrStore() && "Load or store instruction expected");
4676   unsigned Idx =
4677       AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
4678                                                                             : 1;
4679   return MI.getOperand(Idx);
4680 }
4681 
4682 const MachineOperand &
4683 AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
4684   assert(MI.mayLoadOrStore() && "Load or store instruction expected");
4685   unsigned Idx =
4686       AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
4687                                                                             : 2;
4688   return MI.getOperand(Idx);
4689 }
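// Illustrative operand layouts implied by the two index computations above
// (immediates shown already scaled):
//
//   ldr x0, [x1, #16]      LDRXui:  (x0, x1, 2)        base = 1, offset = 2
//   ldp x0, x1, [x2, #16]  LDPXi:   (x0, x1, x2, 2)    base = 2, offset = 3
//   ldr x0, [x1, #16]!     LDRXpre: the base-writeback def occupies operand 0,
//                          shifting base and offset to indices 2 and 3.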
4690 
4691 const MachineOperand &
4692 AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
4693   switch (MI.getOpcode()) {
4694   default:
4695     llvm_unreachable("Unexpected opcode");
4696   case AArch64::LDRBroX:
4697   case AArch64::LDRBBroX:
4698   case AArch64::LDRSBXroX:
4699   case AArch64::LDRSBWroX:
4700   case AArch64::LDRHroX:
4701   case AArch64::LDRHHroX:
4702   case AArch64::LDRSHXroX:
4703   case AArch64::LDRSHWroX:
4704   case AArch64::LDRWroX:
4705   case AArch64::LDRSroX:
4706   case AArch64::LDRSWroX:
4707   case AArch64::LDRDroX:
4708   case AArch64::LDRXroX:
4709   case AArch64::LDRQroX:
4710     return MI.getOperand(4);
4711   }
4712 }
4713 
4714 static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
4715                                               Register Reg) {
4716   if (MI.getParent() == nullptr)
4717     return nullptr;
4718   const MachineFunction *MF = MI.getParent()->getParent();
4719   return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
4720 }
4721 
4722 bool AArch64InstrInfo::isHForm(const MachineInstr &MI) {
4723   auto IsHFPR = [&](const MachineOperand &Op) {
4724     if (!Op.isReg())
4725       return false;
4726     auto Reg = Op.getReg();
4727     if (Reg.isPhysical())
4728       return AArch64::FPR16RegClass.contains(Reg);
4729     const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4730     return TRC == &AArch64::FPR16RegClass ||
4731            TRC == &AArch64::FPR16_loRegClass;
4732   };
4733   return llvm::any_of(MI.operands(), IsHFPR);
4734 }
4735 
4736 bool AArch64InstrInfo::isQForm(const MachineInstr &MI) {
4737   auto IsQFPR = [&](const MachineOperand &Op) {
4738     if (!Op.isReg())
4739       return false;
4740     auto Reg = Op.getReg();
4741     if (Reg.isPhysical())
4742       return AArch64::FPR128RegClass.contains(Reg);
4743     const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4744     return TRC == &AArch64::FPR128RegClass ||
4745            TRC == &AArch64::FPR128_loRegClass;
4746   };
4747   return llvm::any_of(MI.operands(), IsQFPR);
4748 }
4749 
4750 bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) {
4751   switch (MI.getOpcode()) {
4752   case AArch64::BRK:
4753   case AArch64::HLT:
4754   case AArch64::PACIASP:
4755   case AArch64::PACIBSP:
4756     // Implicit BTI behavior.
4757     return true;
4758   case AArch64::PAUTH_PROLOGUE:
4759     // PAUTH_PROLOGUE expands to PACI(A|B)SP.
4760     return true;
4761   case AArch64::HINT: {
4762     unsigned Imm = MI.getOperand(0).getImm();
4763     // Explicit BTI instruction.
4764     if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
4765       return true;
4766     // PACI(A|B)SP instructions.
4767     if (Imm == 25 || Imm == 27)
4768       return true;
4769     return false;
4770   }
4771   default:
4772     return false;
4773   }
4774 }
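// For reference, the HINT immediates tested above decode as follows
// (architectural encodings): #32 = BTI, #34 = BTI c, #36 = BTI j,
// #38 = BTI jc, #25 = PACIASP, #27 = PACIBSP.  The PACI(A|B)SP forms are
// valid BTI landing pads, which is why they count as having BTI semantics.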
4775 
4776 bool AArch64InstrInfo::isFpOrNEON(Register Reg) {
4777   if (Reg == 0)
4778     return false;
4779   assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
4780   return AArch64::FPR128RegClass.contains(Reg) ||
4781          AArch64::FPR64RegClass.contains(Reg) ||
4782          AArch64::FPR32RegClass.contains(Reg) ||
4783          AArch64::FPR16RegClass.contains(Reg) ||
4784          AArch64::FPR8RegClass.contains(Reg);
4785 }
4786 
4787 bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) {
4788   auto IsFPR = [&](const MachineOperand &Op) {
4789     if (!Op.isReg())
4790       return false;
4791     auto Reg = Op.getReg();
4792     if (Reg.isPhysical())
4793       return isFpOrNEON(Reg);
4794 
4795     const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4796     return TRC == &AArch64::FPR128RegClass ||
4797            TRC == &AArch64::FPR128_loRegClass ||
4798            TRC == &AArch64::FPR64RegClass ||
4799            TRC == &AArch64::FPR64_loRegClass ||
4800            TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
4801            TRC == &AArch64::FPR8RegClass;
4802   };
4803   return llvm::any_of(MI.operands(), IsFPR);
4804 }
4805 
4806 // Scale the unscaled offset.  Returns false if the unscaled offset can't be
4807 // scaled.
4808 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
4809   int Scale = AArch64InstrInfo::getMemScale(Opc);
4810 
4811   // If the byte-offset isn't a multiple of the stride, we can't scale this
4812   // offset.
4813   if (Offset % Scale != 0)
4814     return false;
4815 
4816   // Convert the byte-offset used by unscaled instructions into an "element"
4817   // offset used by the scaled pair load/store instructions.
4818   Offset /= Scale;
4819   return true;
4820 }
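// Worked example (illustrative): STURXi has an 8-byte stride, so a byte
// offset of 24 scales to element offset 3, while 20 is rejected:
//
//   int64_t Off = 24;
//   scaleOffset(AArch64::STURXi, Off); // returns true,  Off is now 3
//   Off = 20;
//   scaleOffset(AArch64::STURXi, Off); // returns false, Off is unchanged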
4821 
4822 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
4823   if (FirstOpc == SecondOpc)
4824     return true;
4825   // We can also pair sign-ext and zero-ext instructions.
4826   switch (FirstOpc) {
4827   default:
4828     return false;
4829   case AArch64::STRSui:
4830   case AArch64::STURSi:
4831     return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
4832   case AArch64::STRDui:
4833   case AArch64::STURDi:
4834     return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
4835   case AArch64::STRQui:
4836   case AArch64::STURQi:
4837     return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
4838   case AArch64::STRWui:
4839   case AArch64::STURWi:
4840     return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
4841   case AArch64::STRXui:
4842   case AArch64::STURXi:
4843     return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
4844   case AArch64::LDRSui:
4845   case AArch64::LDURSi:
4846     return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
4847   case AArch64::LDRDui:
4848   case AArch64::LDURDi:
4849     return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
4850   case AArch64::LDRQui:
4851   case AArch64::LDURQi:
4852     return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
4853   case AArch64::LDRWui:
4854   case AArch64::LDURWi:
4855     return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
4856   case AArch64::LDRSWui:
4857   case AArch64::LDURSWi:
4858     return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
4859   case AArch64::LDRXui:
4860   case AArch64::LDURXi:
4861     return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
4862   }
4863   // These instructions can't be paired based on their opcodes.
4864   return false;
4865 }
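// Illustrative results of the pairing rules above:
//   canPairLdStOpc(AArch64::STRXui, AArch64::STURXi)  -> true  (scaled/unscaled)
//   canPairLdStOpc(AArch64::LDRWui, AArch64::LDRSWui) -> true  (zero/sign extend)
//   canPairLdStOpc(AArch64::LDRWui, AArch64::LDRXui)  -> false (widths differ)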
4866 
4867 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
4868                             int64_t Offset1, unsigned Opcode1, int FI2,
4869                             int64_t Offset2, unsigned Opcode2) {
4870   // Accesses through fixed stack object frame indices may access a different
4871   // fixed stack slot. Check that the object offsets + offsets match.
4872   if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
4873     int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
4874     int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
4875     assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
4876     // Convert to scaled object offsets.
4877     int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
4878     if (ObjectOffset1 % Scale1 != 0)
4879       return false;
4880     ObjectOffset1 /= Scale1;
4881     int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
4882     if (ObjectOffset2 % Scale2 != 0)
4883       return false;
4884     ObjectOffset2 /= Scale2;
4885     ObjectOffset1 += Offset1;
4886     ObjectOffset2 += Offset2;
4887     return ObjectOffset1 + 1 == ObjectOffset2;
4888   }
4889 
4890   return FI1 == FI2;
4891 }
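// Worked example (illustrative): two fixed objects at byte offsets 8 and 16,
// both accessed by LDRXui (scale 8) with instruction offset 0, yield scaled
// object offsets 1 and 2.  Since 1 + 1 == 2, the accesses are adjacent and
// clustering is allowed.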
4892 
4893 /// Detect opportunities for ldp/stp formation.
4894 ///
4895 /// Only called for LdSt for which getMemOperandWithOffset returns true.
4896 bool AArch64InstrInfo::shouldClusterMemOps(
4897     ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
4898     bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
4899     int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
4900     unsigned NumBytes) const {
4901   assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
4902   const MachineOperand &BaseOp1 = *BaseOps1.front();
4903   const MachineOperand &BaseOp2 = *BaseOps2.front();
4904   const MachineInstr &FirstLdSt = *BaseOp1.getParent();
4905   const MachineInstr &SecondLdSt = *BaseOp2.getParent();
4906   if (BaseOp1.getType() != BaseOp2.getType())
4907     return false;
4908 
4909   assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
4910          "Only base registers and frame indices are supported.");
4911 
4912   // Check for both base regs and base FI.
4913   if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
4914     return false;
4915 
4916   // Only cluster up to a single pair.
4917   if (ClusterSize > 2)
4918     return false;
4919 
4920   if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
4921     return false;
4922 
4923   // Can we pair these instructions based on their opcodes?
4924   unsigned FirstOpc = FirstLdSt.getOpcode();
4925   unsigned SecondOpc = SecondLdSt.getOpcode();
4926   if (!canPairLdStOpc(FirstOpc, SecondOpc))
4927     return false;
4928 
4929   // Can't merge volatiles or load/stores that have a hint to avoid pair
4930   // formation, for example.
4931   if (!isCandidateToMergeOrPair(FirstLdSt) ||
4932       !isCandidateToMergeOrPair(SecondLdSt))
4933     return false;
4934 
4935   // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
4936   int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
4937   if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
4938     return false;
4939 
4940   int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
4941   if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
4942     return false;
4943 
4944   // Pairwise instructions have a 7-bit signed offset field.
4945   if (Offset1 > 63 || Offset1 < -64)
4946     return false;
4947 
4948   // The caller should already have ordered First/SecondLdSt by offset.
4949   // Note: this need not hold when the frame index bases are not equal.
4950   if (BaseOp1.isFI()) {
4951     assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
4952            "Caller should have ordered offsets.");
4953 
4954     const MachineFrameInfo &MFI =
4955         FirstLdSt.getParent()->getParent()->getFrameInfo();
4956     return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
4957                            BaseOp2.getIndex(), Offset2, SecondOpc);
4958   }
4959 
4960   assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
4961 
4962   return Offset1 + 1 == Offset2;
4963 }
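// Worked example (illustrative): "ldr x0, [x1, #8]" and "ldr x2, [x1, #16]"
// share base x1 and have scaled offsets 1 and 2.  Offset1 fits the 7-bit
// signed field (-64..63) and 1 + 1 == 2, so the two loads are clustered and
// may later become "ldp x0, x2, [x1, #8]".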
4964 
4965 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
4966                                             MCRegister Reg, unsigned SubIdx,
4967                                             unsigned State,
4968                                             const TargetRegisterInfo *TRI) {
4969   if (!SubIdx)
4970     return MIB.addReg(Reg, State);
4971 
4972   if (Reg.isPhysical())
4973     return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
4974   return MIB.addReg(Reg, State, SubIdx);
4975 }
4976 
4977 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
4978                                         unsigned NumRegs) {
4979   // We really want the positive remainder mod 32 here, which happens to be
4980   // easily obtainable with a mask.
4981   return ((DestReg - SrcReg) & 0x1f) < NumRegs;
4982 }
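// Worked example (illustrative): copying the tuple D1_D2 from D0_D1 gives
// (1 - 0) & 0x1f == 1 < 2, so a forward sub-register copy would overwrite D1
// before reading it and the caller must copy in reverse.  The mask also
// handles encoding wraparound: for V0 from V31, (0 - 31) & 0x1f == 1.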
4983 
4984 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
4985                                         MachineBasicBlock::iterator I,
4986                                         const DebugLoc &DL, MCRegister DestReg,
4987                                         MCRegister SrcReg, bool KillSrc,
4988                                         unsigned Opcode,
4989                                         ArrayRef<unsigned> Indices) const {
4990   assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
4991   const TargetRegisterInfo *TRI = &getRegisterInfo();
4992   uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
4993   uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
4994   unsigned NumRegs = Indices.size();
4995 
4996   int SubReg = 0, End = NumRegs, Incr = 1;
4997   if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
4998     SubReg = NumRegs - 1;
4999     End = -1;
5000     Incr = -1;
5001   }
5002 
5003   for (; SubReg != End; SubReg += Incr) {
5004     const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
5005     AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
5006     AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
5007     AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
5008   }
5009 }
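// Illustrative expansion: copying Q1_Q2 from Q0_Q1 overlaps forward, so the
// loop above runs in reverse and emits
//   orr v2.16b, v1.16b, v1.16b
//   orr v1.16b, v0.16b, v0.16b
// whereas the forward order would clobber Q1 before it is read.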
5010 
5011 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
5012                                        MachineBasicBlock::iterator I,
5013                                        const DebugLoc &DL, MCRegister DestReg,
5014                                        MCRegister SrcReg, bool KillSrc,
5015                                        unsigned Opcode, unsigned ZeroReg,
5016                                        llvm::ArrayRef<unsigned> Indices) const {
5017   const TargetRegisterInfo *TRI = &getRegisterInfo();
5018   unsigned NumRegs = Indices.size();
5019 
5020 #ifndef NDEBUG
5021   uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
5022   uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
5023   assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
5024          "GPR reg sequences should not be able to overlap");
5025 #endif
5026 
5027   for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
5028     const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
5029     AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
5030     MIB.addReg(ZeroReg);
5031     AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
5032     MIB.addImm(0);
5033   }
5034 }
5035 
5036 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
5037                                    MachineBasicBlock::iterator I,
5038                                    const DebugLoc &DL, Register DestReg,
5039                                    Register SrcReg, bool KillSrc,
5040                                    bool RenamableDest,
5041                                    bool RenamableSrc) const {
5042   if (AArch64::GPR32spRegClass.contains(DestReg) &&
5043       (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
5044     const TargetRegisterInfo *TRI = &getRegisterInfo();
5045 
5046     if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
5047       // If either operand is WSP, expand to ADD #0.
5048       if (Subtarget.hasZeroCycleRegMoveGPR64() &&
5049           !Subtarget.hasZeroCycleRegMoveGPR32()) {
5050         // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
5051         MCRegister DestRegX = TRI->getMatchingSuperReg(
5052             DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
5053         MCRegister SrcRegX = TRI->getMatchingSuperReg(
5054             SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
5055         // This instruction is reading and writing X registers.  This may upset
5056         // the register scavenger and machine verifier, so we need to indicate
5057         // that we are reading an undefined value from SrcRegX, but a proper
5058         // value from SrcReg.
5059         BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
5060             .addReg(SrcRegX, RegState::Undef)
5061             .addImm(0)
5062             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
5063             .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5064       } else {
5065         BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
5066             .addReg(SrcReg, getKillRegState(KillSrc))
5067             .addImm(0)
5068             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5069       }
5070     } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
5071       BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
5072           .addImm(0)
5073           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5074     } else {
5075       if (Subtarget.hasZeroCycleRegMoveGPR64() &&
5076           !Subtarget.hasZeroCycleRegMoveGPR32()) {
5077         // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
5078         MCRegister DestRegX = TRI->getMatchingSuperReg(
5079             DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
5080         MCRegister SrcRegX = TRI->getMatchingSuperReg(
5081             SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
5082         // This instruction is reading and writing X registers.  This may upset
5083         // the register scavenger and machine verifier, so we need to indicate
5084         // that we are reading an undefined value from SrcRegX, but a proper
5085         // value from SrcReg.
5086         BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
5087             .addReg(AArch64::XZR)
5088             .addReg(SrcRegX, RegState::Undef)
5089             .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5090       } else {
5091         // Otherwise, expand to ORR WZR.
5092         BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
5093             .addReg(AArch64::WZR)
5094             .addReg(SrcReg, getKillRegState(KillSrc));
5095       }
5096     }
5097     return;
5098   }
5099 
5100   // Copy a Predicate register by ORRing with itself.
5101   if (AArch64::PPRRegClass.contains(DestReg) &&
5102       AArch64::PPRRegClass.contains(SrcReg)) {
5103     assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5104            "Unexpected SVE register.");
5105     BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
5106       .addReg(SrcReg) // Pg
5107       .addReg(SrcReg)
5108       .addReg(SrcReg, getKillRegState(KillSrc));
5109     return;
5110   }
5111 
5112   // Copy a predicate-as-counter register by ORRing with itself as if it
5113   // were a regular predicate (mask) register.
5114   bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
5115   bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
5116   if (DestIsPNR || SrcIsPNR) {
5117     auto ToPPR = [](MCRegister R) -> MCRegister {
5118       return (R - AArch64::PN0) + AArch64::P0;
5119     };
5120     MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg.asMCReg();
5121     MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg.asMCReg();
5122 
5123     if (PPRSrcReg != PPRDestReg) {
5124       auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)
5125                        .addReg(PPRSrcReg) // Pg
5126                        .addReg(PPRSrcReg)
5127                        .addReg(PPRSrcReg, getKillRegState(KillSrc));
5128       if (DestIsPNR)
5129         NewMI.addDef(DestReg, RegState::Implicit);
5130     }
5131     return;
5132   }
5133 
5134   // Copy a Z register by ORRing with itself.
5135   if (AArch64::ZPRRegClass.contains(DestReg) &&
5136       AArch64::ZPRRegClass.contains(SrcReg)) {
5137     assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5138            "Unexpected SVE register.");
5139     BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
5140       .addReg(SrcReg)
5141       .addReg(SrcReg, getKillRegState(KillSrc));
5142     return;
5143   }
5144 
5145   // Copy a Z register pair by copying the individual sub-registers.
5146   if ((AArch64::ZPR2RegClass.contains(DestReg) ||
5147        AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
5148       (AArch64::ZPR2RegClass.contains(SrcReg) ||
5149        AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
5150     assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5151            "Unexpected SVE register.");
5152     static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
5153     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5154                      Indices);
5155     return;
5156   }
5157 
5158   // Copy a Z register triple by copying the individual sub-registers.
5159   if (AArch64::ZPR3RegClass.contains(DestReg) &&
5160       AArch64::ZPR3RegClass.contains(SrcReg)) {
5161     assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5162            "Unexpected SVE register.");
5163     static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5164                                        AArch64::zsub2};
5165     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5166                      Indices);
5167     return;
5168   }
5169 
5170   // Copy a Z register quad by copying the individual sub-registers.
5171   if ((AArch64::ZPR4RegClass.contains(DestReg) ||
5172        AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
5173       (AArch64::ZPR4RegClass.contains(SrcReg) ||
5174        AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
5175     assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5176            "Unexpected SVE register.");
5177     static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
5178                                        AArch64::zsub2, AArch64::zsub3};
5179     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
5180                      Indices);
5181     return;
5182   }
5183 
5184   if (AArch64::GPR64spRegClass.contains(DestReg) &&
5185       (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
5186     if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
5187       // If either operand is SP, expand to ADD #0.
5188       BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
5189           .addReg(SrcReg, getKillRegState(KillSrc))
5190           .addImm(0)
5191           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5192     } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
5193       BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
5194           .addImm(0)
5195           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5196     } else {
5197       // Otherwise, expand to ORR XZR.
5198       BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
5199           .addReg(AArch64::XZR)
5200           .addReg(SrcReg, getKillRegState(KillSrc));
5201     }
5202     return;
5203   }
5204 
5205   // Copy a DDDD register quad by copying the individual sub-registers.
5206   if (AArch64::DDDDRegClass.contains(DestReg) &&
5207       AArch64::DDDDRegClass.contains(SrcReg)) {
5208     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5209                                        AArch64::dsub2, AArch64::dsub3};
5210     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5211                      Indices);
5212     return;
5213   }
5214 
5215   // Copy a DDD register triple by copying the individual sub-registers.
5216   if (AArch64::DDDRegClass.contains(DestReg) &&
5217       AArch64::DDDRegClass.contains(SrcReg)) {
5218     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
5219                                        AArch64::dsub2};
5220     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5221                      Indices);
5222     return;
5223   }
5224 
5225   // Copy a DD register pair by copying the individual sub-registers.
5226   if (AArch64::DDRegClass.contains(DestReg) &&
5227       AArch64::DDRegClass.contains(SrcReg)) {
5228     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
5229     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
5230                      Indices);
5231     return;
5232   }
5233 
5234   // Copy a QQQQ register quad by copying the individual sub-registers.
5235   if (AArch64::QQQQRegClass.contains(DestReg) &&
5236       AArch64::QQQQRegClass.contains(SrcReg)) {
5237     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5238                                        AArch64::qsub2, AArch64::qsub3};
5239     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5240                      Indices);
5241     return;
5242   }
5243 
5244   // Copy a QQQ register triple by copying the individual sub-registers.
5245   if (AArch64::QQQRegClass.contains(DestReg) &&
5246       AArch64::QQQRegClass.contains(SrcReg)) {
5247     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
5248                                        AArch64::qsub2};
5249     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5250                      Indices);
5251     return;
5252   }
5253 
5254   // Copy a QQ register pair by copying the individual sub-registers.
5255   if (AArch64::QQRegClass.contains(DestReg) &&
5256       AArch64::QQRegClass.contains(SrcReg)) {
5257     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
5258     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
5259                      Indices);
5260     return;
5261   }
5262 
5263   if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
5264       AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
5265     static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
5266     copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
5267                     AArch64::XZR, Indices);
5268     return;
5269   }
5270 
5271   if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
5272       AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
5273     static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
5274     copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
5275                     AArch64::WZR, Indices);
5276     return;
5277   }
5278 
5279   if (AArch64::FPR128RegClass.contains(DestReg) &&
5280       AArch64::FPR128RegClass.contains(SrcReg)) {
5281     if (Subtarget.isSVEorStreamingSVEAvailable() &&
5282         !Subtarget.isNeonAvailable())
5283       BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
5284           .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
5285           .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
5286           .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
5287     else if (Subtarget.isNeonAvailable())
5288       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
5289           .addReg(SrcReg)
5290           .addReg(SrcReg, getKillRegState(KillSrc));
5291     else {
5292       BuildMI(MBB, I, DL, get(AArch64::STRQpre))
5293           .addReg(AArch64::SP, RegState::Define)
5294           .addReg(SrcReg, getKillRegState(KillSrc))
5295           .addReg(AArch64::SP)
5296           .addImm(-16);
5297       BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
5298           .addReg(AArch64::SP, RegState::Define)
5299           .addReg(DestReg, RegState::Define)
5300           .addReg(AArch64::SP)
5301           .addImm(16);
5302     }
5303     return;
5304   }
5305 
5306   if (AArch64::FPR64RegClass.contains(DestReg) &&
5307       AArch64::FPR64RegClass.contains(SrcReg)) {
5308     BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
5309         .addReg(SrcReg, getKillRegState(KillSrc));
5310     return;
5311   }
5312 
5313   if (AArch64::FPR32RegClass.contains(DestReg) &&
5314       AArch64::FPR32RegClass.contains(SrcReg)) {
5315     if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5316         !Subtarget.hasZeroCycleRegMoveFPR32()) {
5317       const TargetRegisterInfo *TRI = &getRegisterInfo();
5318       MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
5319                                                      &AArch64::FPR64RegClass);
5320       MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::ssub,
5321                                                     &AArch64::FPR64RegClass);
5322       // This instruction is reading and writing D registers. This may upset
5323       // the register scavenger and machine verifier, so we need to indicate
5324       // that we are reading an undefined value from SrcRegD, but a proper
5325       // value from SrcReg.
5326       BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
5327           .addReg(SrcRegD, RegState::Undef)
5328           .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5329     } else {
5330       BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5331           .addReg(SrcReg, getKillRegState(KillSrc));
5332     }
5333     return;
5334   }
5335 
5336   if (AArch64::FPR16RegClass.contains(DestReg) &&
5337       AArch64::FPR16RegClass.contains(SrcReg)) {
5338     if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5339         !Subtarget.hasZeroCycleRegMoveFPR32()) {
5340       const TargetRegisterInfo *TRI = &getRegisterInfo();
5341       MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
5342                                                      &AArch64::FPR64RegClass);
5343       MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::hsub,
5344                                                     &AArch64::FPR64RegClass);
5345       // This instruction is reading and writing D registers. This may upset
5346       // the register scavenger and machine verifier, so we need to indicate
5347       // that we are reading an undefined value from SrcRegD, but a proper
5348       // value from SrcReg.
5349       BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
5350           .addReg(SrcRegD, RegState::Undef)
5351           .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5352     } else {
5353       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
5354                                        &AArch64::FPR32RegClass);
5355       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
5356                                       &AArch64::FPR32RegClass);
5357       BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5358           .addReg(SrcReg, getKillRegState(KillSrc));
5359     }
5360     return;
5361   }
5362 
5363   if (AArch64::FPR8RegClass.contains(DestReg) &&
5364       AArch64::FPR8RegClass.contains(SrcReg)) {
5365     if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5366         !Subtarget.hasZeroCycleRegMoveFPR32()) {
5367       const TargetRegisterInfo *TRI = &getRegisterInfo();
5368       MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::bsub,
5369                                                      &AArch64::FPR64RegClass);
5370       MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::bsub,
5371                                                     &AArch64::FPR64RegClass);
5372       // This instruction is reading and writing D registers. This may upset
5373       // the register scavenger and machine verifier, so we need to indicate
5374       // that we are reading an undefined value from SrcRegD, but a proper
5375       // value from SrcReg.
5376       BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
5377           .addReg(SrcRegD, RegState::Undef)
5378           .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5379     } else {
5380       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
5381                                        &AArch64::FPR32RegClass);
5382       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
5383                                       &AArch64::FPR32RegClass);
5384       BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5385           .addReg(SrcReg, getKillRegState(KillSrc));
5386     }
5387     return;
5388   }
5389 
5390   // Copies between GPR64 and FPR64.
5391   if (AArch64::FPR64RegClass.contains(DestReg) &&
5392       AArch64::GPR64RegClass.contains(SrcReg)) {
5393     BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
5394         .addReg(SrcReg, getKillRegState(KillSrc));
5395     return;
5396   }
5397   if (AArch64::GPR64RegClass.contains(DestReg) &&
5398       AArch64::FPR64RegClass.contains(SrcReg)) {
5399     BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
5400         .addReg(SrcReg, getKillRegState(KillSrc));
5401     return;
5402   }
5403   // Copies between GPR32 and FPR32.
5404   if (AArch64::FPR32RegClass.contains(DestReg) &&
5405       AArch64::GPR32RegClass.contains(SrcReg)) {
5406     BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
5407         .addReg(SrcReg, getKillRegState(KillSrc));
5408     return;
5409   }
5410   if (AArch64::GPR32RegClass.contains(DestReg) &&
5411       AArch64::FPR32RegClass.contains(SrcReg)) {
5412     BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
5413         .addReg(SrcReg, getKillRegState(KillSrc));
5414     return;
5415   }
5416 
5417   if (DestReg == AArch64::NZCV) {
5418     assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
5419     BuildMI(MBB, I, DL, get(AArch64::MSR))
5420         .addImm(AArch64SysReg::NZCV)
5421         .addReg(SrcReg, getKillRegState(KillSrc))
5422         .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
5423     return;
5424   }
5425 
5426   if (SrcReg == AArch64::NZCV) {
5427     assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
5428     BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
5429         .addImm(AArch64SysReg::NZCV)
5430         .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
5431     return;
5432   }
5433 
5434 #ifndef NDEBUG
5435   const TargetRegisterInfo &TRI = getRegisterInfo();
5436   errs() << TRI.getRegAsmName(DestReg) << " = COPY "
5437          << TRI.getRegAsmName(SrcReg) << "\n";
5438 #endif
5439   llvm_unreachable("unimplemented reg-to-reg copy");
5440 }
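// Illustrative expansions selected above (assuming no zero-cycle-move
// subtarget features are enabled):
//   copy w0 <- w1  =>  orr w0, wzr, w1
//   copy x0 <- x1  =>  orr x0, xzr, x1
//   copy sp <- x0  =>  add sp, x0, #0    (ORR cannot address SP)
//   copy d0 <- d1  =>  fmov d0, d1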
5441 
5442 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
5443                                     MachineBasicBlock &MBB,
5444                                     MachineBasicBlock::iterator InsertBefore,
5445                                     const MCInstrDesc &MCID,
5446                                     Register SrcReg, bool IsKill,
5447                                     unsigned SubIdx0, unsigned SubIdx1, int FI,
5448                                     MachineMemOperand *MMO) {
5449   Register SrcReg0 = SrcReg;
5450   Register SrcReg1 = SrcReg;
5451   if (SrcReg.isPhysical()) {
5452     SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
5453     SubIdx0 = 0;
5454     SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
5455     SubIdx1 = 0;
5456   }
5457   BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
5458       .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
5459       .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
5460       .addFrameIndex(FI)
5461       .addImm(0)
5462       .addMemOperand(MMO);
5463 }
5464 
5465 void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
5466                                            MachineBasicBlock::iterator MBBI,
5467                                            Register SrcReg, bool isKill, int FI,
5468                                            const TargetRegisterClass *RC,
5469                                            const TargetRegisterInfo *TRI,
5470                                            Register VReg,
5471                                            MachineInstr::MIFlag Flags) const {
5472   MachineFunction &MF = *MBB.getParent();
5473   MachineFrameInfo &MFI = MF.getFrameInfo();
5474 
5475   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
5476   MachineMemOperand *MMO =
5477       MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
5478                               MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
5479   unsigned Opc = 0;
5480   bool Offset = true;
5481   MCRegister PNRReg = MCRegister::NoRegister;
5482   unsigned StackID = TargetStackID::Default;
5483   switch (TRI->getSpillSize(*RC)) {
5484   case 1:
5485     if (AArch64::FPR8RegClass.hasSubClassEq(RC))
5486       Opc = AArch64::STRBui;
5487     break;
5488   case 2: {
5489     if (AArch64::FPR16RegClass.hasSubClassEq(RC))
5490       Opc = AArch64::STRHui;
5491     else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
5492              AArch64::PPRRegClass.hasSubClassEq(RC)) {
5493       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5494              "Unexpected register store without SVE store instructions");
5495       Opc = AArch64::STR_PXI;
5496       StackID = TargetStackID::ScalableVector;
5497     }
5498     break;
5499   }
5500   case 4:
5501     if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
5502       Opc = AArch64::STRWui;
5503       if (SrcReg.isVirtual())
5504         MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
5505       else
5506         assert(SrcReg != AArch64::WSP);
5507     } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
5508       Opc = AArch64::STRSui;
5509     else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
5510       Opc = AArch64::STR_PPXI;
5511       StackID = TargetStackID::ScalableVector;
5512     }
5513     break;
5514   case 8:
5515     if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
5516       Opc = AArch64::STRXui;
5517       if (SrcReg.isVirtual())
5518         MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
5519       else
5520         assert(SrcReg != AArch64::SP);
5521     } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
5522       Opc = AArch64::STRDui;
5523     } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
5524       storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
5525                               get(AArch64::STPWi), SrcReg, isKill,
5526                               AArch64::sube32, AArch64::subo32, FI, MMO);
5527       return;
5528     }
5529     break;
5530   case 16:
5531     if (AArch64::FPR128RegClass.hasSubClassEq(RC))
5532       Opc = AArch64::STRQui;
5533     else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
5534       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
5535       Opc = AArch64::ST1Twov1d;
5536       Offset = false;
5537     } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
5538       storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
5539                               get(AArch64::STPXi), SrcReg, isKill,
5540                               AArch64::sube64, AArch64::subo64, FI, MMO);
5541       return;
5542     } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
5543       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5544              "Unexpected register store without SVE store instructions");
5545       Opc = AArch64::STR_ZXI;
5546       StackID = TargetStackID::ScalableVector;
5547     } else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
5548       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5549              "Unexpected predicate store without SVE store instructions");
5550       Opc = AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO;
5551       StackID = TargetStackID::ScalableVector;
5552     }
5553     break;
5554   case 24:
5555     if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
5556       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
5557       Opc = AArch64::ST1Threev1d;
5558       Offset = false;
5559     }
5560     break;
5561   case 32:
5562     if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
5563       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
5564       Opc = AArch64::ST1Fourv1d;
5565       Offset = false;
5566     } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
5567       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
5568       Opc = AArch64::ST1Twov2d;
5569       Offset = false;
5570     } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5571       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5572              "Unexpected register store without SVE store instructions");
5573       Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS;
5574       StackID = TargetStackID::ScalableVector;
5575     } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
5576       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5577              "Unexpected register store without SVE store instructions");
5578       Opc = AArch64::STR_ZZXI;
5579       StackID = TargetStackID::ScalableVector;
5580     }
5581     break;
5582   case 48:
5583     if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
5584       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
5585       Opc = AArch64::ST1Threev2d;
5586       Offset = false;
5587     } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
5588       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5589              "Unexpected register store without SVE store instructions");
5590       Opc = AArch64::STR_ZZZXI;
5591       StackID = TargetStackID::ScalableVector;
5592     }
5593     break;
5594   case 64:
5595     if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
5596       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
5597       Opc = AArch64::ST1Fourv2d;
5598       Offset = false;
5599     } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5600       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5601              "Unexpected register store without SVE store instructions");
5602       Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS;
5603       StackID = TargetStackID::ScalableVector;
5604     } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
5605       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5606              "Unexpected register store without SVE store instructions");
5607       Opc = AArch64::STR_ZZZZXI;
5608       StackID = TargetStackID::ScalableVector;
5609     }
5610     break;
5611   }
5612   assert(Opc && "Unknown register class");
5613   MFI.setStackID(FI, StackID);
5614 
5615   const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
5616                                      .addReg(SrcReg, getKillRegState(isKill))
5617                                      .addFrameIndex(FI);
5618 
5619   if (Offset)
5620     MI.addImm(0);
5621   if (PNRReg.isValid())
5622     MI.addDef(PNRReg, RegState::Implicit);
5623   MI.addMemOperand(MMO);
5624 }
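// Illustrative opcode selection for common spills (from the switch above):
//   FPR128 (16-byte spill)  -> STRQui on the default stack
//   ZPR (scalable)          -> STR_ZXI on a ScalableVector stack slot
//   XSeqPairs (two GPR64s)  -> one STPXi via storeRegPairToStackSlot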
5625 
5626 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
5627                                      MachineBasicBlock &MBB,
5628                                      MachineBasicBlock::iterator InsertBefore,
5629                                      const MCInstrDesc &MCID,
5630                                      Register DestReg, unsigned SubIdx0,
5631                                      unsigned SubIdx1, int FI,
5632                                      MachineMemOperand *MMO) {
5633   Register DestReg0 = DestReg;
5634   Register DestReg1 = DestReg;
5635   bool IsUndef = true;
5636   if (DestReg.isPhysical()) {
5637     DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
5638     SubIdx0 = 0;
5639     DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
5640     SubIdx1 = 0;
5641     IsUndef = false;
5642   }
5643   BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
5644       .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
5645       .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
5646       .addFrameIndex(FI)
5647       .addImm(0)
5648       .addMemOperand(MMO);
5649 }
5650 
5651 void AArch64InstrInfo::loadRegFromStackSlot(
5652     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
5653     int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
5654     Register VReg, MachineInstr::MIFlag Flags) const {
5655   MachineFunction &MF = *MBB.getParent();
5656   MachineFrameInfo &MFI = MF.getFrameInfo();
5657   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
5658   MachineMemOperand *MMO =
5659       MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
5660                               MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
5661 
5662   unsigned Opc = 0;
5663   bool Offset = true;
5664   unsigned StackID = TargetStackID::Default;
5665   Register PNRReg = MCRegister::NoRegister;
5666   switch (TRI->getSpillSize(*RC)) {
5667   case 1:
5668     if (AArch64::FPR8RegClass.hasSubClassEq(RC))
5669       Opc = AArch64::LDRBui;
5670     break;
5671   case 2: {
5672     bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
5673     if (AArch64::FPR16RegClass.hasSubClassEq(RC))
5674       Opc = AArch64::LDRHui;
5675     else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
5676       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5677              "Unexpected register load without SVE load instructions");
5678       if (IsPNR)
5679         PNRReg = DestReg;
5680       Opc = AArch64::LDR_PXI;
5681       StackID = TargetStackID::ScalableVector;
5682     }
5683     break;
5684   }
5685   case 4:
5686     if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
5687       Opc = AArch64::LDRWui;
5688       if (DestReg.isVirtual())
5689         MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
5690       else
5691         assert(DestReg != AArch64::WSP);
5692     } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
5693       Opc = AArch64::LDRSui;
5694     else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
5695       Opc = AArch64::LDR_PPXI;
5696       StackID = TargetStackID::ScalableVector;
5697     }
5698     break;
5699   case 8:
5700     if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
5701       Opc = AArch64::LDRXui;
5702       if (DestReg.isVirtual())
5703         MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
5704       else
5705         assert(DestReg != AArch64::SP);
5706     } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
5707       Opc = AArch64::LDRDui;
5708     } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
5709       loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
5710                                get(AArch64::LDPWi), DestReg, AArch64::sube32,
5711                                AArch64::subo32, FI, MMO);
5712       return;
5713     }
5714     break;
5715   case 16:
5716     if (AArch64::FPR128RegClass.hasSubClassEq(RC))
5717       Opc = AArch64::LDRQui;
5718     else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
5719       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5720       Opc = AArch64::LD1Twov1d;
5721       Offset = false;
5722     } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
5723       loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
5724                                get(AArch64::LDPXi), DestReg, AArch64::sube64,
5725                                AArch64::subo64, FI, MMO);
5726       return;
5727     } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
5728       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5729              "Unexpected register load without SVE load instructions");
5730       Opc = AArch64::LDR_ZXI;
5731       StackID = TargetStackID::ScalableVector;
5732     } else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
5733       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5734              "Unexpected predicate load without SVE load instructions");
5735       Opc = AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO;
5736       StackID = TargetStackID::ScalableVector;
5737     }
5738     break;
5739   case 24:
5740     if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
5741       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5742       Opc = AArch64::LD1Threev1d;
5743       Offset = false;
5744     }
5745     break;
5746   case 32:
5747     if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
5748       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5749       Opc = AArch64::LD1Fourv1d;
5750       Offset = false;
5751     } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
5752       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5753       Opc = AArch64::LD1Twov2d;
5754       Offset = false;
5755     } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5756       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5757              "Unexpected register load without SVE load instructions");
5758       Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS;
5759       StackID = TargetStackID::ScalableVector;
5760     } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
5761       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5762              "Unexpected register load without SVE load instructions");
5763       Opc = AArch64::LDR_ZZXI;
5764       StackID = TargetStackID::ScalableVector;
5765     }
5766     break;
5767   case 48:
5768     if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
5769       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5770       Opc = AArch64::LD1Threev2d;
5771       Offset = false;
5772     } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
5773       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5774              "Unexpected register load without SVE load instructions");
5775       Opc = AArch64::LDR_ZZZXI;
5776       StackID = TargetStackID::ScalableVector;
5777     }
5778     break;
5779   case 64:
5780     if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
5781       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5782       Opc = AArch64::LD1Fourv2d;
5783       Offset = false;
5784     } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5785       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5786              "Unexpected register load without SVE load instructions");
5787       Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS;
5788       StackID = TargetStackID::ScalableVector;
5789     } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
5790       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5791              "Unexpected register load without SVE load instructions");
5792       Opc = AArch64::LDR_ZZZZXI;
5793       StackID = TargetStackID::ScalableVector;
5794     }
5795     break;
5796   }
5797 
5798   assert(Opc && "Unknown register class");
5799   MFI.setStackID(FI, StackID);
5800 
5801   const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
5802                                      .addReg(DestReg, getDefRegState(true))
5803                                      .addFrameIndex(FI);
5804   if (Offset)
5805     MI.addImm(0);
5806   if (PNRReg.isValid() && !PNRReg.isVirtual())
5807     MI.addDef(PNRReg, RegState::Implicit);
5808   MI.addMemOperand(MMO);
5809 }
5810 
5811 bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
5812                                            const MachineInstr &UseMI,
5813                                            const TargetRegisterInfo *TRI) {
5814   return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
5815                                          UseMI.getIterator()),
5816                 [TRI](const MachineInstr &I) {
5817                   return I.modifiesRegister(AArch64::NZCV, TRI) ||
5818                          I.readsRegister(AArch64::NZCV, TRI);
5819                 });
5820 }
5821 
5822 void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
5823     const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
5824   // The smallest scalable elements supported by scaled SVE addressing
5825   // modes are predicates, which are 2 scalable bytes in size. So the scalable
5826   // byte offset must always be a multiple of 2.
5827   assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
5828 
5829   // VGSized offsets are divided by '2', because the VG register is the
5830   // number of 64-bit granules as opposed to 128-bit vector chunks,
5831   // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
5832   // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
5833   // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
5834   ByteSized = Offset.getFixed();
5835   VGSized = Offset.getScalable() / 2;
5836 }
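
// Editorial worked example (illustration only, not part of the original
// source): for a StackOffset of 16 fixed bytes and 34 scalable bytes,
//
//   int64_t ByteSized, VGSized;
//   AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
//       StackOffset::get(/*Fixed=*/16, /*Scalable=*/34), ByteSized, VGSized);
//
// yields ByteSized == 16 and VGSized == 17, so a DWARF expression built from
// these parts describes the runtime offset 16 + 17 * VG.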
5837 
5838 /// Decomposes the given frame offset into parts from which an equivalent
5839 /// frame offset can be materialized.
5840 /// For non-scalable offsets this is simply the byte size.
5841 void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
5842     const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
5843     int64_t &NumDataVectors) {
5844   // The smallest scalable elements supported by scaled SVE addressing
5845   // modes are predicates, which are 2 scalable bytes in size. So the scalable
5846   // byte offset must always be a multiple of 2.
5847   assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
5848 
5849   NumBytes = Offset.getFixed();
5850   NumDataVectors = 0;
5851   NumPredicateVectors = Offset.getScalable() / 2;
5852   // This method is used to get the offsets needed to adjust the frame offset.
5853   // If materializing the offset would require more than two ADDPL
5854   // instructions, part of the offset is folded into NumDataVectors so that
5855   // ADDVL can cover that part, reducing the number of ADDPL instructions.
5856   if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
5857       NumPredicateVectors > 62) {
5858     NumDataVectors = NumPredicateVectors / 8;
5859     NumPredicateVectors -= NumDataVectors * 8;
5860   }
5861 }
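
// Editorial worked example (illustration only): a purely scalable offset of
// 144 bytes gives NumPredicateVectors = 72, which is outside [-64, 62], so it
// is folded into NumDataVectors = 9 with NumPredicateVectors = 0 (a single
// ADDVL #9). An offset of 18 scalable bytes gives NumPredicateVectors = 9,
// which is in range and not a multiple of 8, so it is left as-is (ADDPL #9).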
5862 
5863 // Convenience function to create a DWARF expression for
5864 //   Expr + NumBytes + NumVGScaledBytes * AArch64::VG
5865 static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes,
5866                                      int NumVGScaledBytes, unsigned VG,
5867                                      llvm::raw_string_ostream &Comment) {
5868   uint8_t buffer[16];
5869 
5870   if (NumBytes) {
5871     Expr.push_back(dwarf::DW_OP_consts);
5872     Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
5873     Expr.push_back((uint8_t)dwarf::DW_OP_plus);
5874     Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
5875   }
5876 
5877   if (NumVGScaledBytes) {
5878     Expr.push_back((uint8_t)dwarf::DW_OP_consts);
5879     Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));
5880 
5881     Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
5882     Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
5883     Expr.push_back(0);
5884 
5885     Expr.push_back((uint8_t)dwarf::DW_OP_mul);
5886     Expr.push_back((uint8_t)dwarf::DW_OP_plus);
5887 
5888     Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
5889             << std::abs(NumVGScaledBytes) << " * VG";
5890   }
5891 }
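
// Editorial sketch (illustration only): with NumBytes = -16 and
// NumVGScaledBytes = -8, the function appends the DWARF operations
//
//   DW_OP_consts -16, DW_OP_plus,
//   DW_OP_consts -8, DW_OP_bregx VG 0, DW_OP_mul, DW_OP_plus
//
// and the comment stream reads " - 16 - 8 * VG".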
5892 
5893 // Creates an MCCFIInstruction:
5894 //    { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
5895 static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
5896                                                unsigned Reg,
5897                                                const StackOffset &Offset) {
5898   int64_t NumBytes, NumVGScaledBytes;
5899   AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
5900                                                         NumVGScaledBytes);
5901   std::string CommentBuffer;
5902   llvm::raw_string_ostream Comment(CommentBuffer);
5903 
5904   if (Reg == AArch64::SP)
5905     Comment << "sp";
5906   else if (Reg == AArch64::FP)
5907     Comment << "fp";
5908   else
5909     Comment << printReg(Reg, &TRI);
5910 
5911   // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG)
5912   SmallString<64> Expr;
5913   unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5914   Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg));
5915   Expr.push_back(0);
5916   appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
5917                            TRI.getDwarfRegNum(AArch64::VG, true), Comment);
5918 
5919   // Wrap this into DW_CFA_def_cfa.
5920   SmallString<64> DefCfaExpr;
5921   DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
5922   uint8_t buffer[16];
5923   DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer));
5924   DefCfaExpr.append(Expr.str());
5925   return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(),
5926                                         Comment.str());
5927 }
5928 
5929 MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI,
5930                                     unsigned FrameReg, unsigned Reg,
5931                                     const StackOffset &Offset,
5932                                     bool LastAdjustmentWasScalable) {
5933   if (Offset.getScalable())
5934     return createDefCFAExpression(TRI, Reg, Offset);
5935 
5936   if (FrameReg == Reg && !LastAdjustmentWasScalable)
5937     return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));
5938 
5939   unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5940   return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
5941 }
5942 
5943 MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI,
5944                                        unsigned Reg,
5945                                        const StackOffset &OffsetFromDefCFA) {
5946   int64_t NumBytes, NumVGScaledBytes;
5947   AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
5948       OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
5949 
5950   unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5951 
5952   // Non-scalable offsets can use DW_CFA_offset directly.
5953   if (!NumVGScaledBytes)
5954     return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
5955 
5956   std::string CommentBuffer;
5957   llvm::raw_string_ostream Comment(CommentBuffer);
5958   Comment << printReg(Reg, &TRI) << "  @ cfa";
5959 
5960   // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
5961   SmallString<64> OffsetExpr;
5962   appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
5963                            TRI.getDwarfRegNum(AArch64::VG, true), Comment);
5964 
5965   // Wrap this into DW_CFA_expression
5966   SmallString<64> CfaExpr;
5967   CfaExpr.push_back(dwarf::DW_CFA_expression);
5968   uint8_t buffer[16];
5969   CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
5970   CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
5971   CfaExpr.append(OffsetExpr.str());
5972 
5973   return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(),
5974                                         Comment.str());
5975 }
5976 
5977 // Helper function to emit a frame offset adjustment from a given
5978 // pointer (SrcReg), stored into DestReg. Unlike emitFrameOffset below, the
5979 // caller must supply the opcode explicitly.
5980 static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
5981                                MachineBasicBlock::iterator MBBI,
5982                                const DebugLoc &DL, unsigned DestReg,
5983                                unsigned SrcReg, int64_t Offset, unsigned Opc,
5984                                const TargetInstrInfo *TII,
5985                                MachineInstr::MIFlag Flag, bool NeedsWinCFI,
5986                                bool *HasWinCFI, bool EmitCFAOffset,
5987                                StackOffset CFAOffset, unsigned FrameReg) {
5988   int Sign = 1;
5989   unsigned MaxEncoding, ShiftSize;
5990   switch (Opc) {
5991   case AArch64::ADDXri:
5992   case AArch64::ADDSXri:
5993   case AArch64::SUBXri:
5994   case AArch64::SUBSXri:
5995     MaxEncoding = 0xfff;
5996     ShiftSize = 12;
5997     break;
5998   case AArch64::ADDVL_XXI:
5999   case AArch64::ADDPL_XXI:
6000   case AArch64::ADDSVL_XXI:
6001   case AArch64::ADDSPL_XXI:
6002     MaxEncoding = 31;
6003     ShiftSize = 0;
6004     if (Offset < 0) {
6005       MaxEncoding = 32;
6006       Sign = -1;
6007       Offset = -Offset;
6008     }
6009     break;
6010   default:
6011     llvm_unreachable("Unsupported opcode");
6012   }
6013 
6014   // `Offset` can be in bytes or in "scalable bytes".
6015   int VScale = 1;
6016   if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
6017     VScale = 16;
6018   else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
6019     VScale = 2;
6020 
6021   // FIXME: If the offset won't fit in 24-bits, compute the offset into a
6022   // scratch register.  If DestReg is a virtual register, use it as the
6023   // scratch register; otherwise, create a new virtual register (to be
6024   // replaced by the scavenger at the end of PEI).  That case can be optimized
6025   // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
6026   // register can be loaded with offset%8 and the add/sub can use an extending
6027   // instruction with LSL#3.
6028   // Currently the function handles any offsets but generates a poor sequence
6029   // of code.
6030   //  assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
6031 
6032   const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
6033   Register TmpReg = DestReg;
6034   if (TmpReg == AArch64::XZR)
6035     TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
6036         &AArch64::GPR64RegClass);
6037   do {
6038     uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
6039     unsigned LocalShiftSize = 0;
6040     if (ThisVal > MaxEncoding) {
6041       ThisVal = ThisVal >> ShiftSize;
6042       LocalShiftSize = ShiftSize;
6043     }
6044     assert((ThisVal >> ShiftSize) <= MaxEncoding &&
6045            "Encoding cannot handle value that big");
6046 
6047     Offset -= ThisVal << LocalShiftSize;
6048     if (Offset == 0)
6049       TmpReg = DestReg;
6050     auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
6051                    .addReg(SrcReg)
6052                    .addImm(Sign * (int)ThisVal);
6053     if (ShiftSize)
6054       MBI = MBI.addImm(
6055           AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
6056     MBI = MBI.setMIFlag(Flag);
6057 
6058     auto Change =
6059         VScale == 1
6060             ? StackOffset::getFixed(ThisVal << LocalShiftSize)
6061             : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
6062     if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
6063       CFAOffset += Change;
6064     else
6065       CFAOffset -= Change;
6066     if (EmitCFAOffset && DestReg == TmpReg) {
6067       MachineFunction &MF = *MBB.getParent();
6068       const TargetSubtargetInfo &STI = MF.getSubtarget();
6069       const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
6070 
6071       unsigned CFIIndex = MF.addFrameInst(
6072           createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
6073       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
6074           .addCFIIndex(CFIIndex)
6075           .setMIFlags(Flag);
6076     }
6077 
6078     if (NeedsWinCFI) {
6079       int Imm = (int)(ThisVal << LocalShiftSize);
6080       if (VScale != 1 && DestReg == AArch64::SP) {
6081         if (HasWinCFI)
6082           *HasWinCFI = true;
6083         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AllocZ))
6084             .addImm(ThisVal)
6085             .setMIFlag(Flag);
6086       } else if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
6087                  (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
6088         assert(VScale == 1 && "Expected non-scalable operation");
6089         if (HasWinCFI)
6090           *HasWinCFI = true;
6091         if (Imm == 0)
6092           BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
6093         else
6094           BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
6095               .addImm(Imm)
6096               .setMIFlag(Flag);
6097         assert(Offset == 0 && "Expected remaining offset to be zero to "
6098                               "emit a single SEH directive");
6099       } else if (DestReg == AArch64::SP) {
6100         assert(VScale == 1 && "Expected non-scalable operation");
6101         if (HasWinCFI)
6102           *HasWinCFI = true;
6103         assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
6104         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
6105             .addImm(Imm)
6106             .setMIFlag(Flag);
6107       }
6108     }
6109 
6110     SrcReg = TmpReg;
6111   } while (Offset);
6112 }
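
// Editorial worked example (illustration only): for Opc = ADDXri and
// Offset = 4100, MaxEncoding = 0xfff and ShiftSize = 12. The first loop
// iteration finds ThisVal = 4100 > 4095, shifts it down to 1 with
// LocalShiftSize = 12, and emits "ADD DestReg, SrcReg, #1, lsl #12",
// leaving Offset = 4; the second iteration emits
// "ADD DestReg, DestReg, #4" and the loop terminates.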
6113 
6114 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
6115                            MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
6116                            unsigned DestReg, unsigned SrcReg,
6117                            StackOffset Offset, const TargetInstrInfo *TII,
6118                            MachineInstr::MIFlag Flag, bool SetNZCV,
6119                            bool NeedsWinCFI, bool *HasWinCFI,
6120                            bool EmitCFAOffset, StackOffset CFAOffset,
6121                            unsigned FrameReg) {
6122   // If a function is marked as arm_locally_streaming, then the runtime value of
6123   // vscale in the prologue/epilogue is different from the runtime value of vscale
6124   // in the function's body. To avoid having to consider multiple vscales,
6125   // we can use `addsvl` to allocate any scalable stack-slots, which under
6126   // most circumstances will be only locals, not callee-save slots.
6127   const Function &F = MBB.getParent()->getFunction();
6128   bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");
6129 
6130   int64_t Bytes, NumPredicateVectors, NumDataVectors;
6131   AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
6132       Offset, Bytes, NumPredicateVectors, NumDataVectors);
6133 
6134   // First emit non-scalable frame offsets, or a simple 'mov'.
6135   if (Bytes || (!Offset && SrcReg != DestReg)) {
6136     assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
6137            "SP increment/decrement not 8-byte aligned");
6138     unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
6139     if (Bytes < 0) {
6140       Bytes = -Bytes;
6141       Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
6142     }
6143     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
6144                        NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6145                        FrameReg);
6146     CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
6147                      ? StackOffset::getFixed(-Bytes)
6148                      : StackOffset::getFixed(Bytes);
6149     SrcReg = DestReg;
6150     FrameReg = DestReg;
6151   }
6152 
6153   assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
6154          "SetNZCV not supported with SVE vectors");
6155   assert(!(NeedsWinCFI && NumPredicateVectors) &&
6156          "WinCFI can't allocate fractions of an SVE data vector");
6157 
6158   if (NumDataVectors) {
6159     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
6160                        UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI, TII,
6161                        Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6162                        FrameReg);
6163     CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
6164     SrcReg = DestReg;
6165   }
6166 
6167   if (NumPredicateVectors) {
6168     assert(DestReg != AArch64::SP && "Unaligned access to SP");
6169     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
6170                        UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI, TII,
6171                        Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
6172                        FrameReg);
6173   }
6174 }
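
// Editorial usage sketch (hypothetical values, illustration only):
// allocating 16 fixed bytes plus two SVE data vectors in a prologue,
//
//   emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
//                   StackOffset::get(/*Fixed=*/-16, /*Scalable=*/-32), TII,
//                   MachineInstr::FrameSetup);
//
// decomposes into "SUB sp, sp, #16" followed by "ADDVL sp, sp, #-2": the -32
// scalable bytes are -16 predicate vectors, a multiple of 8, and so fold
// entirely into NumDataVectors = -2.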
6175 
6176 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
6177     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
6178     MachineBasicBlock::iterator InsertPt, int FrameIndex,
6179     LiveIntervals *LIS, VirtRegMap *VRM) const {
6180   // This is a bit of a hack. Consider this instruction:
6181   //
6182   //   %0 = COPY %sp; GPR64all:%0
6183   //
6184   // We explicitly chose GPR64all for the virtual register so such a copy might
6185   // be eliminated by RegisterCoalescer. However, that may not be possible, and
6186   // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
6187   // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
6188   //
6189   // To prevent that, we are going to constrain the %0 register class here.
6190   if (MI.isFullCopy()) {
6191     Register DstReg = MI.getOperand(0).getReg();
6192     Register SrcReg = MI.getOperand(1).getReg();
6193     if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
6194       MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
6195       return nullptr;
6196     }
6197     if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
6198       MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
6199       return nullptr;
6200     }
6201     // Nothing can be folded with a copy from/to NZCV.
6202     if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
6203       return nullptr;
6204   }
6205 
6206   // Handle the case where a copy is being spilled or filled but the source
6207   // and destination register class don't match.  For example:
6208   //
6209   //   %0 = COPY %xzr; GPR64common:%0
6210   //
6211   // In this case we can still safely fold away the COPY and generate the
6212   // following spill code:
6213   //
6214   //   STRXui %xzr, %stack.0
6215   //
6216   // This also eliminates spilled cross register class COPYs (e.g. between x and
6217   // d regs) of the same size.  For example:
6218   //
6219   //   %0 = COPY %1; GPR64:%0, FPR64:%1
6220   //
6221   // will be filled as
6222   //
6223   //   LDRDui %0, fi<#0>
6224   //
6225   // instead of
6226   //
6227   //   LDRXui %Temp, fi<#0>
6228   //   %0 = FMOV %Temp
6229   //
6230   if (MI.isCopy() && Ops.size() == 1 &&
6231       // Make sure we're only folding the explicit COPY defs/uses.
6232       (Ops[0] == 0 || Ops[0] == 1)) {
6233     bool IsSpill = Ops[0] == 0;
6234     bool IsFill = !IsSpill;
6235     const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
6236     const MachineRegisterInfo &MRI = MF.getRegInfo();
6237     MachineBasicBlock &MBB = *MI.getParent();
6238     const MachineOperand &DstMO = MI.getOperand(0);
6239     const MachineOperand &SrcMO = MI.getOperand(1);
6240     Register DstReg = DstMO.getReg();
6241     Register SrcReg = SrcMO.getReg();
6242     // This is slightly expensive to compute for physical regs since
6243     // getMinimalPhysRegClass is slow.
6244     auto getRegClass = [&](unsigned Reg) {
6245       return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
6246                                               : TRI.getMinimalPhysRegClass(Reg);
6247     };
6248 
6249     if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
6250       assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
6251                  TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
6252              "Mismatched register size in non subreg COPY");
6253       if (IsSpill)
6254         storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
6255                             getRegClass(SrcReg), &TRI, Register());
6256       else
6257         loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
6258                              getRegClass(DstReg), &TRI, Register());
6259       return &*--InsertPt;
6260     }
6261 
6262     // Handle cases like spilling def of:
6263     //
6264     //   %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
6265     //
6266     // where the physical register source can be widened and stored to the full
6267     // virtual reg destination stack slot, in this case producing:
6268     //
6269     //   STRXui %xzr, %stack.0
6270     //
6271     if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
6272         TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) {
6273       assert(SrcMO.getSubReg() == 0 &&
6274              "Unexpected subreg on physical register");
6275       storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
6276                           FrameIndex, &AArch64::GPR64RegClass, &TRI,
6277                           Register());
6278       return &*--InsertPt;
6279     }
6280 
6281     // Handle cases like filling use of:
6282     //
6283     //   %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
6284     //
6285     // where we can load the full virtual reg source stack slot, into the subreg
6286     // destination, in this case producing:
6287     //
6288     //   LDRWui %0:sub_32<def,read-undef>, %stack.0
6289     //
6290     if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
6291       const TargetRegisterClass *FillRC;
6292       switch (DstMO.getSubReg()) {
6293       default:
6294         FillRC = nullptr;
6295         break;
6296       case AArch64::sub_32:
6297         FillRC = &AArch64::GPR32RegClass;
6298         break;
6299       case AArch64::ssub:
6300         FillRC = &AArch64::FPR32RegClass;
6301         break;
6302       case AArch64::dsub:
6303         FillRC = &AArch64::FPR64RegClass;
6304         break;
6305       }
6306 
6307       if (FillRC) {
6308         assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
6309                    TRI.getRegSizeInBits(*FillRC) &&
6310                "Mismatched regclass size on folded subreg COPY");
6311         loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI,
6312                              Register());
6313         MachineInstr &LoadMI = *--InsertPt;
6314         MachineOperand &LoadDst = LoadMI.getOperand(0);
6315         assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
6316         LoadDst.setSubReg(DstMO.getSubReg());
6317         LoadDst.setIsUndef();
6318         return &LoadMI;
6319       }
6320     }
6321   }
6322 
6323   // Cannot fold.
6324   return nullptr;
6325 }
6326 
6327 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
6328                                     StackOffset &SOffset,
6329                                     bool *OutUseUnscaledOp,
6330                                     unsigned *OutUnscaledOp,
6331                                     int64_t *EmittableOffset) {
6332   // Set output values in case of early exit.
6333   if (EmittableOffset)
6334     *EmittableOffset = 0;
6335   if (OutUseUnscaledOp)
6336     *OutUseUnscaledOp = false;
6337   if (OutUnscaledOp)
6338     *OutUnscaledOp = 0;
6339 
6340   // Exit early for structured vector spills/fills as they can't take an
6341   // immediate offset.
6342   switch (MI.getOpcode()) {
6343   default:
6344     break;
6345   case AArch64::LD1Rv1d:
6346   case AArch64::LD1Rv2s:
6347   case AArch64::LD1Rv2d:
6348   case AArch64::LD1Rv4h:
6349   case AArch64::LD1Rv4s:
6350   case AArch64::LD1Rv8b:
6351   case AArch64::LD1Rv8h:
6352   case AArch64::LD1Rv16b:
6353   case AArch64::LD1Twov2d:
6354   case AArch64::LD1Threev2d:
6355   case AArch64::LD1Fourv2d:
6356   case AArch64::LD1Twov1d:
6357   case AArch64::LD1Threev1d:
6358   case AArch64::LD1Fourv1d:
6359   case AArch64::ST1Twov2d:
6360   case AArch64::ST1Threev2d:
6361   case AArch64::ST1Fourv2d:
6362   case AArch64::ST1Twov1d:
6363   case AArch64::ST1Threev1d:
6364   case AArch64::ST1Fourv1d:
6365   case AArch64::ST1i8:
6366   case AArch64::ST1i16:
6367   case AArch64::ST1i32:
6368   case AArch64::ST1i64:
6369   case AArch64::IRG:
6370   case AArch64::IRGstack:
6371   case AArch64::STGloop:
6372   case AArch64::STZGloop:
6373     return AArch64FrameOffsetCannotUpdate;
6374   }
6375 
6376   // Get the min/max offset and the scale.
6377   TypeSize ScaleValue(0U, false), Width(0U, false);
6378   int64_t MinOff, MaxOff;
6379   if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
6380                                       MaxOff))
6381     llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
6382 
6383   // Construct the complete offset.
6384   bool IsMulVL = ScaleValue.isScalable();
6385   unsigned Scale = ScaleValue.getKnownMinValue();
6386   int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
6387 
6388   const MachineOperand &ImmOpnd =
6389       MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
6390   Offset += ImmOpnd.getImm() * Scale;
6391 
6392   // If the offset doesn't match the scale, we rewrite the instruction to
6393   // use the unscaled instruction instead. We do the same if the offset is
6394   // negative and there is an unscaled op to use.
6395   std::optional<unsigned> UnscaledOp =
6396       AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
6397   bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
6398   if (useUnscaledOp &&
6399       !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
6400                                       MaxOff))
6401     llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
6402 
6403   Scale = ScaleValue.getKnownMinValue();
6404   assert(IsMulVL == ScaleValue.isScalable() &&
6405          "Unscaled opcode has different value for scalable");
6406 
6407   int64_t Remainder = Offset % Scale;
6408   assert(!(Remainder && useUnscaledOp) &&
6409          "Cannot have remainder when using unscaled op");
6410 
6411   assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
6412   int64_t NewOffset = Offset / Scale;
6413   if (MinOff <= NewOffset && NewOffset <= MaxOff)
6414     Offset = Remainder;
6415   else {
6416     NewOffset = NewOffset < 0 ? MinOff : MaxOff;
6417     Offset = Offset - (NewOffset * Scale);
6418   }
6419 
6420   if (EmittableOffset)
6421     *EmittableOffset = NewOffset;
6422   if (OutUseUnscaledOp)
6423     *OutUseUnscaledOp = useUnscaledOp;
6424   if (OutUnscaledOp && UnscaledOp)
6425     *OutUnscaledOp = *UnscaledOp;
6426 
6427   if (IsMulVL)
6428     SOffset = StackOffset::get(SOffset.getFixed(), Offset);
6429   else
6430     SOffset = StackOffset::get(Offset, SOffset.getScalable());
6431   return AArch64FrameOffsetCanUpdate |
6432          (SOffset ? 0 : AArch64FrameOffsetIsLegal);
6433 }
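
// Editorial worked example (illustration only): for an LDRXui (Scale = 8,
// immediate range [0, 4095]) with immediate operand 0 and an SOffset of 40
// fixed bytes, the offset divides evenly: *EmittableOffset = 5, SOffset
// becomes zero, and the result is AArch64FrameOffsetCanUpdate |
// AArch64FrameOffsetIsLegal. For SOffset = 44 the remainder of 4 selects the
// unscaled LDURXi form instead, with *OutUseUnscaledOp = true and
// *EmittableOffset = 44.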
6434 
6435 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
6436                                     unsigned FrameReg, StackOffset &Offset,
6437                                     const AArch64InstrInfo *TII) {
6438   unsigned Opcode = MI.getOpcode();
6439   unsigned ImmIdx = FrameRegIdx + 1;
6440 
6441   if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
6442     Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
6443     emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
6444                     MI.getOperand(0).getReg(), FrameReg, Offset, TII,
6445                     MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
6446     MI.eraseFromParent();
6447     Offset = StackOffset();
6448     return true;
6449   }
6450 
6451   int64_t NewOffset;
6452   unsigned UnscaledOp;
6453   bool UseUnscaledOp;
6454   int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
6455                                          &UnscaledOp, &NewOffset);
6456   if (Status & AArch64FrameOffsetCanUpdate) {
6457     if (Status & AArch64FrameOffsetIsLegal)
6458       // Replace the FrameIndex with FrameReg.
6459       MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
6460     if (UseUnscaledOp)
6461       MI.setDesc(TII->get(UnscaledOp));
6462 
6463     MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
6464     return !Offset;
6465   }
6466 
6467   return false;
6468 }
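
// Editorial sketch (hypothetical operands, illustration only): if the frame
// index in "%0 = ADDXri %stack.0, 12, 0" resolves through FrameReg = $fp at
// Offset = fixed -32, the ADD is replaced by an emitFrameOffset sequence
// computing fp - 20, the original instruction is erased, and the function
// returns true with Offset reset to zero.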
6469 
6470 void AArch64InstrInfo::insertNoop(MachineBasicBlock &MBB,
6471                                   MachineBasicBlock::iterator MI) const {
6472   DebugLoc DL;
6473   BuildMI(MBB, MI, DL, get(AArch64::HINT)).addImm(0);
6474 }
6475 
6476 MCInst AArch64InstrInfo::getNop() const {
6477   return MCInstBuilder(AArch64::HINT).addImm(0);
6478 }
6479 
6480 // AArch64 supports MachineCombiner.
6481 bool AArch64InstrInfo::useMachineCombiner() const { return true; }
6482 
6483 // True when Opc sets the NZCV flags.
6484 static bool isCombineInstrSettingFlag(unsigned Opc) {
6485   switch (Opc) {
6486   case AArch64::ADDSWrr:
6487   case AArch64::ADDSWri:
6488   case AArch64::ADDSXrr:
6489   case AArch64::ADDSXri:
6490   case AArch64::SUBSWrr:
6491   case AArch64::SUBSXrr:
6492   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
6493   case AArch64::SUBSWri:
6494   case AArch64::SUBSXri:
6495     return true;
6496   default:
6497     break;
6498   }
6499   return false;
6500 }
6501 
6502 // 32b Opcodes that can be combined with a MUL
6503 static bool isCombineInstrCandidate32(unsigned Opc) {
6504   switch (Opc) {
6505   case AArch64::ADDWrr:
6506   case AArch64::ADDWri:
6507   case AArch64::SUBWrr:
6508   case AArch64::ADDSWrr:
6509   case AArch64::ADDSWri:
6510   case AArch64::SUBSWrr:
6511   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
6512   case AArch64::SUBWri:
6513   case AArch64::SUBSWri:
6514     return true;
6515   default:
6516     break;
6517   }
6518   return false;
6519 }
6520 
6521 // 64b Opcodes that can be combined with a MUL
6522 static bool isCombineInstrCandidate64(unsigned Opc) {
6523   switch (Opc) {
6524   case AArch64::ADDXrr:
6525   case AArch64::ADDXri:
6526   case AArch64::SUBXrr:
6527   case AArch64::ADDSXrr:
6528   case AArch64::ADDSXri:
6529   case AArch64::SUBSXrr:
6530   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
6531   case AArch64::SUBXri:
6532   case AArch64::SUBSXri:
6533   case AArch64::ADDv8i8:
6534   case AArch64::ADDv16i8:
6535   case AArch64::ADDv4i16:
6536   case AArch64::ADDv8i16:
6537   case AArch64::ADDv2i32:
6538   case AArch64::ADDv4i32:
6539   case AArch64::SUBv8i8:
6540   case AArch64::SUBv16i8:
6541   case AArch64::SUBv4i16:
6542   case AArch64::SUBv8i16:
6543   case AArch64::SUBv2i32:
6544   case AArch64::SUBv4i32:
6545     return true;
6546   default:
6547     break;
6548   }
6549   return false;
6550 }
6551 
6552 // FP opcodes that can be combined with an FMUL.
6553 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
6554   switch (Inst.getOpcode()) {
6555   default:
6556     break;
6557   case AArch64::FADDHrr:
6558   case AArch64::FADDSrr:
6559   case AArch64::FADDDrr:
6560   case AArch64::FADDv4f16:
6561   case AArch64::FADDv8f16:
6562   case AArch64::FADDv2f32:
6563   case AArch64::FADDv2f64:
6564   case AArch64::FADDv4f32:
6565   case AArch64::FSUBHrr:
6566   case AArch64::FSUBSrr:
6567   case AArch64::FSUBDrr:
6568   case AArch64::FSUBv4f16:
6569   case AArch64::FSUBv8f16:
6570   case AArch64::FSUBv2f32:
6571   case AArch64::FSUBv2f64:
6572   case AArch64::FSUBv4f32:
6573     TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
6574     // We can fuse FADD/FSUB with FMUL if fusion is either allowed globally
6575     // by the target options or FADD/FSUB has the contract fast-math flag.
6576     return Options.UnsafeFPMath ||
6577            Options.AllowFPOpFusion == FPOpFusion::Fast ||
6578            Inst.getFlag(MachineInstr::FmContract);
6580   }
6581   return false;
6582 }
6583 
6584 // Opcodes that can be combined with a MUL
6585 static bool isCombineInstrCandidate(unsigned Opc) {
6586   return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
6587 }
6588 
6589 //
6590 // Utility routine that checks if \param MO is defined by an
6591 // \param CombineOpc instruction in the basic block \param MBB
6592 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
6593                        unsigned CombineOpc, unsigned ZeroReg = 0,
6594                        bool CheckZeroReg = false) {
6595   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6596   MachineInstr *MI = nullptr;
6597 
6598   if (MO.isReg() && MO.getReg().isVirtual())
6599     MI = MRI.getUniqueVRegDef(MO.getReg());
6600   // And it needs to be in the trace (otherwise, it won't have a depth).
6601   if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
6602     return false;
6603   // Must only be used by the user we combine with.
6604   if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
6605     return false;
6606 
6607   if (CheckZeroReg) {
6608     assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
6609            MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
6610            MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
6611     // The third input reg must be zero.
6612     if (MI->getOperand(3).getReg() != ZeroReg)
6613       return false;
6614   }
6615 
6616   if (isCombineInstrSettingFlag(CombineOpc) &&
6617       MI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) == -1)
6618     return false;
6619 
6620   return true;
6621 }
6622 
6623 //
6624 // Is \param MO defined by an integer multiply and can be combined?
6625 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
6626                               unsigned MulOpc, unsigned ZeroReg) {
6627   return canCombine(MBB, MO, MulOpc, ZeroReg, true);
6628 }
6629 
6630 //
6631 // Is \param MO defined by a floating-point multiply and can be combined?
6632 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
6633                                unsigned MulOpc) {
6634   return canCombine(MBB, MO, MulOpc);
6635 }
6636 
6637 // TODO: There are many more machine instruction opcodes to match:
6638 //       1. Other data types (integer, vectors)
6639 //       2. Other math / logic operations (xor, or)
6640 //       3. Other forms of the same operation (intrinsics and other variants)
6641 bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
6642                                                    bool Invert) const {
6643   if (Invert)
6644     return false;
6645   switch (Inst.getOpcode()) {
6646   // == Floating-point types ==
6647   // -- Floating-point instructions --
6648   case AArch64::FADDHrr:
6649   case AArch64::FADDSrr:
6650   case AArch64::FADDDrr:
6651   case AArch64::FMULHrr:
6652   case AArch64::FMULSrr:
6653   case AArch64::FMULDrr:
6654   case AArch64::FMULX16:
6655   case AArch64::FMULX32:
6656   case AArch64::FMULX64:
6657   // -- Advanced SIMD instructions --
6658   case AArch64::FADDv4f16:
6659   case AArch64::FADDv8f16:
6660   case AArch64::FADDv2f32:
6661   case AArch64::FADDv4f32:
6662   case AArch64::FADDv2f64:
6663   case AArch64::FMULv4f16:
6664   case AArch64::FMULv8f16:
6665   case AArch64::FMULv2f32:
6666   case AArch64::FMULv4f32:
6667   case AArch64::FMULv2f64:
6668   case AArch64::FMULXv4f16:
6669   case AArch64::FMULXv8f16:
6670   case AArch64::FMULXv2f32:
6671   case AArch64::FMULXv4f32:
6672   case AArch64::FMULXv2f64:
6673   // -- SVE instructions --
6674   // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
6675   // in the SVE instruction set (though there are predicated ones).
6676   case AArch64::FADD_ZZZ_H:
6677   case AArch64::FADD_ZZZ_S:
6678   case AArch64::FADD_ZZZ_D:
6679   case AArch64::FMUL_ZZZ_H:
6680   case AArch64::FMUL_ZZZ_S:
6681   case AArch64::FMUL_ZZZ_D:
6682     return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath ||
6683            (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
6684             Inst.getFlag(MachineInstr::MIFlag::FmNsz));
6685 
6686   // == Integer types ==
6687   // -- Base instructions --
6688   // Opcodes MULWrr and MULXrr don't exist because
6689   // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
6690   // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
6691   // The machine-combiner does not support three-source-operand machine
6692   // instructions, so we cannot reassociate MULs.
6693   case AArch64::ADDWrr:
6694   case AArch64::ADDXrr:
6695   case AArch64::ANDWrr:
6696   case AArch64::ANDXrr:
6697   case AArch64::ORRWrr:
6698   case AArch64::ORRXrr:
6699   case AArch64::EORWrr:
6700   case AArch64::EORXrr:
6701   case AArch64::EONWrr:
6702   case AArch64::EONXrr:
6703   // -- Advanced SIMD instructions --
6704   // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
6705   // in the Advanced SIMD instruction set.
6706   case AArch64::ADDv8i8:
6707   case AArch64::ADDv16i8:
6708   case AArch64::ADDv4i16:
6709   case AArch64::ADDv8i16:
6710   case AArch64::ADDv2i32:
6711   case AArch64::ADDv4i32:
6712   case AArch64::ADDv1i64:
6713   case AArch64::ADDv2i64:
6714   case AArch64::MULv8i8:
6715   case AArch64::MULv16i8:
6716   case AArch64::MULv4i16:
6717   case AArch64::MULv8i16:
6718   case AArch64::MULv2i32:
6719   case AArch64::MULv4i32:
6720   case AArch64::ANDv8i8:
6721   case AArch64::ANDv16i8:
6722   case AArch64::ORRv8i8:
6723   case AArch64::ORRv16i8:
6724   case AArch64::EORv8i8:
6725   case AArch64::EORv16i8:
6726   // -- SVE instructions --
6727   case AArch64::ADD_ZZZ_B:
6728   case AArch64::ADD_ZZZ_H:
6729   case AArch64::ADD_ZZZ_S:
6730   case AArch64::ADD_ZZZ_D:
6731   case AArch64::MUL_ZZZ_B:
6732   case AArch64::MUL_ZZZ_H:
6733   case AArch64::MUL_ZZZ_S:
6734   case AArch64::MUL_ZZZ_D:
6735   case AArch64::AND_ZZZ:
6736   case AArch64::ORR_ZZZ:
6737   case AArch64::EOR_ZZZ:
6738     return true;
6739 
6740   default:
6741     return false;
6742   }
6743 }
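
// Editorial note (illustration only, hypothetical virtual registers):
// reporting an opcode as associative and commutative lets the
// MachineCombiner rewrite a serial chain such as
//
//   %t1 = FADDSrr %a, %b
//   %t2 = FADDSrr %t1, %c
//   %t3 = FADDSrr %t2, %d     ; chain of depth 3
//
// into the shallower form
//
//   %t1 = FADDSrr %a, %b
//   %u  = FADDSrr %c, %d      ; independent of %t1
//   %t3 = FADDSrr %t1, %u     ; depth 2
//
// For FP opcodes this is only sound when unsafe-fp-math or the reassoc+nsz
// fast-math flags permit it, hence the flag checks above.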
6744 
6745 /// Find instructions that can be turned into madd.
6746 static bool getMaddPatterns(MachineInstr &Root,
6747                             SmallVectorImpl<unsigned> &Patterns) {
6748   unsigned Opc = Root.getOpcode();
6749   MachineBasicBlock &MBB = *Root.getParent();
6750   bool Found = false;
6751 
6752   if (!isCombineInstrCandidate(Opc))
6753     return false;
6754   if (isCombineInstrSettingFlag(Opc)) {
6755     int Cmp_NZCV =
6756         Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
6757     // When NZCV is live, bail out.
6758     if (Cmp_NZCV == -1)
6759       return false;
6760     unsigned NewOpc = convertToNonFlagSettingOpc(Root);
6761     // When the opcode can't change, bail out.
6762     // CHECKME: do we miss any cases for opcode conversion?
6763     if (NewOpc == Opc)
6764       return false;
6765     Opc = NewOpc;
6766   }
6767 
6768   auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
6769                       unsigned Pattern) {
6770     if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
6771       Patterns.push_back(Pattern);
6772       Found = true;
6773     }
6774   };
6775 
6776   auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
6777     if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
6778       Patterns.push_back(Pattern);
6779       Found = true;
6780     }
6781   };
6782 
6783   typedef AArch64MachineCombinerPattern MCP;
6784 
6785   switch (Opc) {
6786   default:
6787     break;
6788   case AArch64::ADDWrr:
6789     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6790            "ADDWrr does not have register operands");
6791     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
6792     setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
6793     break;
6794   case AArch64::ADDXrr:
6795     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
6796     setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
6797     break;
6798   case AArch64::SUBWrr:
6799     setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
6800     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
6801     break;
6802   case AArch64::SUBXrr:
6803     setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
6804     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
6805     break;
6806   case AArch64::ADDWri:
6807     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
6808     break;
6809   case AArch64::ADDXri:
6810     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
6811     break;
6812   case AArch64::SUBWri:
6813     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
6814     break;
6815   case AArch64::SUBXri:
6816     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
6817     break;
6818   case AArch64::ADDv8i8:
6819     setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
6820     setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
6821     break;
6822   case AArch64::ADDv16i8:
6823     setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
6824     setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
6825     break;
6826   case AArch64::ADDv4i16:
6827     setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
6828     setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
6829     setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
6830     setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
6831     break;
6832   case AArch64::ADDv8i16:
6833     setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
6834     setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
6835     setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
6836     setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
6837     break;
6838   case AArch64::ADDv2i32:
6839     setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
6840     setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
6841     setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
6842     setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
6843     break;
6844   case AArch64::ADDv4i32:
6845     setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
6846     setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
6847     setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
6848     setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
6849     break;
6850   case AArch64::SUBv8i8:
6851     setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
6852     setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
6853     break;
6854   case AArch64::SUBv16i8:
6855     setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
6856     setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
6857     break;
6858   case AArch64::SUBv4i16:
6859     setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
6860     setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
6861     setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
6862     setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
6863     break;
6864   case AArch64::SUBv8i16:
6865     setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
6866     setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
6867     setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
6868     setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
6869     break;
6870   case AArch64::SUBv2i32:
6871     setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
6872     setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
6873     setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
6874     setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
6875     break;
6876   case AArch64::SUBv4i32:
6877     setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
6878     setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
6879     setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
6880     setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
6881     break;
6882   }
6883   return Found;
6884 }
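
// Editorial sketch (hypothetical virtual registers, illustration only): the
// MULADDW_OP2 pattern above matches
//
//   %3:gpr32 = MADDWrrr %1, %2, $wzr   ; i.e. MUL %1, %2
//   %4:gpr32 = ADDWrr %0, %3
//
// which the combiner can later rewrite as the single instruction
//
//   %4:gpr32 = MADDWrrr %1, %2, %0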
6885 
6886 bool AArch64InstrInfo::isAccumulationOpcode(unsigned Opcode) const {
6887   switch (Opcode) {
6888   default:
6889     break;
6890   case AArch64::UABALB_ZZZ_D:
6891   case AArch64::UABALB_ZZZ_H:
6892   case AArch64::UABALB_ZZZ_S:
6893   case AArch64::UABALT_ZZZ_D:
6894   case AArch64::UABALT_ZZZ_H:
6895   case AArch64::UABALT_ZZZ_S:
6896   case AArch64::SABALB_ZZZ_D:
6897   case AArch64::SABALB_ZZZ_S:
6898   case AArch64::SABALB_ZZZ_H:
6899   case AArch64::SABALT_ZZZ_D:
6900   case AArch64::SABALT_ZZZ_S:
6901   case AArch64::SABALT_ZZZ_H:
6902   case AArch64::UABALv16i8_v8i16:
6903   case AArch64::UABALv2i32_v2i64:
6904   case AArch64::UABALv4i16_v4i32:
6905   case AArch64::UABALv4i32_v2i64:
6906   case AArch64::UABALv8i16_v4i32:
6907   case AArch64::UABALv8i8_v8i16:
6908   case AArch64::UABAv16i8:
6909   case AArch64::UABAv2i32:
6910   case AArch64::UABAv4i16:
6911   case AArch64::UABAv4i32:
6912   case AArch64::UABAv8i16:
6913   case AArch64::UABAv8i8:
6914   case AArch64::SABALv16i8_v8i16:
6915   case AArch64::SABALv2i32_v2i64:
6916   case AArch64::SABALv4i16_v4i32:
6917   case AArch64::SABALv4i32_v2i64:
6918   case AArch64::SABALv8i16_v4i32:
6919   case AArch64::SABALv8i8_v8i16:
6920   case AArch64::SABAv16i8:
6921   case AArch64::SABAv2i32:
6922   case AArch64::SABAv4i16:
6923   case AArch64::SABAv4i32:
6924   case AArch64::SABAv8i16:
6925   case AArch64::SABAv8i8:
6926     return true;
6927   }
6928 
6929   return false;
6930 }
6931 
6932 unsigned AArch64InstrInfo::getAccumulationStartOpcode(
6933     unsigned AccumulationOpcode) const {
6934   switch (AccumulationOpcode) {
6935   default:
6936     llvm_unreachable("Unsupported accumulation Opcode!");
6937   case AArch64::UABALB_ZZZ_D:
6938     return AArch64::UABDLB_ZZZ_D;
6939   case AArch64::UABALB_ZZZ_H:
6940     return AArch64::UABDLB_ZZZ_H;
6941   case AArch64::UABALB_ZZZ_S:
6942     return AArch64::UABDLB_ZZZ_S;
6943   case AArch64::UABALT_ZZZ_D:
6944     return AArch64::UABDLT_ZZZ_D;
6945   case AArch64::UABALT_ZZZ_H:
6946     return AArch64::UABDLT_ZZZ_H;
6947   case AArch64::UABALT_ZZZ_S:
6948     return AArch64::UABDLT_ZZZ_S;
6949   case AArch64::UABALv16i8_v8i16:
6950     return AArch64::UABDLv16i8_v8i16;
6951   case AArch64::UABALv2i32_v2i64:
6952     return AArch64::UABDLv2i32_v2i64;
6953   case AArch64::UABALv4i16_v4i32:
6954     return AArch64::UABDLv4i16_v4i32;
6955   case AArch64::UABALv4i32_v2i64:
6956     return AArch64::UABDLv4i32_v2i64;
6957   case AArch64::UABALv8i16_v4i32:
6958     return AArch64::UABDLv8i16_v4i32;
6959   case AArch64::UABALv8i8_v8i16:
6960     return AArch64::UABDLv8i8_v8i16;
6961   case AArch64::UABAv16i8:
6962     return AArch64::UABDv16i8;
6963   case AArch64::UABAv2i32:
6964     return AArch64::UABDv2i32;
6965   case AArch64::UABAv4i16:
6966     return AArch64::UABDv4i16;
6967   case AArch64::UABAv4i32:
6968     return AArch64::UABDv4i32;
6969   case AArch64::UABAv8i16:
6970     return AArch64::UABDv8i16;
6971   case AArch64::UABAv8i8:
6972     return AArch64::UABDv8i8;
6973   case AArch64::SABALB_ZZZ_D:
6974     return AArch64::SABDLB_ZZZ_D;
6975   case AArch64::SABALB_ZZZ_S:
6976     return AArch64::SABDLB_ZZZ_S;
6977   case AArch64::SABALB_ZZZ_H:
6978     return AArch64::SABDLB_ZZZ_H;
6979   case AArch64::SABALT_ZZZ_D:
6980     return AArch64::SABDLT_ZZZ_D;
6981   case AArch64::SABALT_ZZZ_S:
6982     return AArch64::SABDLT_ZZZ_S;
6983   case AArch64::SABALT_ZZZ_H:
6984     return AArch64::SABDLT_ZZZ_H;
6985   case AArch64::SABALv16i8_v8i16:
6986     return AArch64::SABDLv16i8_v8i16;
6987   case AArch64::SABALv2i32_v2i64:
6988     return AArch64::SABDLv2i32_v2i64;
6989   case AArch64::SABALv4i16_v4i32:
6990     return AArch64::SABDLv4i16_v4i32;
6991   case AArch64::SABALv4i32_v2i64:
6992     return AArch64::SABDLv4i32_v2i64;
6993   case AArch64::SABALv8i16_v4i32:
6994     return AArch64::SABDLv8i16_v4i32;
6995   case AArch64::SABALv8i8_v8i16:
6996     return AArch64::SABDLv8i8_v8i16;
6997   case AArch64::SABAv16i8:
6998     return AArch64::SABDv16i8;
6999   case AArch64::SABAv2i32:
7000     return AArch64::SABDv2i32;
7001   case AArch64::SABAv4i16:
7002     return AArch64::SABDv4i16;
7003   case AArch64::SABAv4i32:
7004     return AArch64::SABDv4i32;
7005   case AArch64::SABAv8i16:
7006     return AArch64::SABDv8i16;
7007   case AArch64::SABAv8i8:
7008     return AArch64::SABDv8i8;
7009   }
7010 }
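
// Editorial note (illustration only, hypothetical virtual registers): this
// mapping lets the accumulator-chain optimization break a serial chain such
// as
//
//   %a1 = UABALv8i8_v8i16 %a0, %p, %q
//   %a2 = UABALv8i8_v8i16 %a1, %r, %s
//
// by rewriting the first link of a new chain as the non-accumulating start
// opcode UABDLv8i8_v8i16, so independent partial sums can be formed in
// parallel and added together at the end.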
7011 
7012 /// Floating-Point Support
7013 
7014 /// Find instructions that can be turned into fmadd.
7015 static bool getFMAPatterns(MachineInstr &Root,
7016                            SmallVectorImpl<unsigned> &Patterns) {
7017 
7018   if (!isCombineInstrCandidateFP(Root))
7019     return false;
7020 
7021   MachineBasicBlock &MBB = *Root.getParent();
7022   bool Found = false;
7023 
7024   auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
7025     if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
7026       Patterns.push_back(Pattern);
7027       return true;
7028     }
7029     return false;
7030   };
7031 
7032   typedef AArch64MachineCombinerPattern MCP;
7033 
7034   switch (Root.getOpcode()) {
7035   default:
7036     assert(false && "Unsupported FP instruction in combiner\n");
7037     break;
7038   case AArch64::FADDHrr:
7039     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7040            "FADDHrr does not have register operands");
7041 
7042     Found  = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
7043     Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
7044     break;
7045   case AArch64::FADDSrr:
7046     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
7047            "FADDSrr does not have register operands");
7048 
7049     Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
7050              Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
7051 
7052     Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
7053              Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
7054     break;
7055   case AArch64::FADDDrr:
7056     Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
7057              Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
7058 
7059     Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
7060              Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
7061     break;
7062   case AArch64::FADDv4f16:
7063     Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
7064              Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
7065 
7066     Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
7067              Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
7068     break;
7069   case AArch64::FADDv8f16:
7070     Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
7071              Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
7072 
7073     Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
7074              Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
7075     break;
7076   case AArch64::FADDv2f32:
7077     Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
7078              Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
7079 
7080     Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
7081              Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
7082     break;
7083   case AArch64::FADDv2f64:
7084     Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
7085              Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
7086 
7087     Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
7088              Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
7089     break;
7090   case AArch64::FADDv4f32:
7091     Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
7092              Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
7093 
7094     Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
7095              Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
7096     break;
7097   case AArch64::FSUBHrr:
7098     Found  = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
7099     Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
7100     Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
7101     break;
7102   case AArch64::FSUBSrr:
7103     Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
7104 
7105     Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
7106              Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
7107 
7108     Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
7109     break;
7110   case AArch64::FSUBDrr:
7111     Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
7112 
7113     Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
7114              Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
7115 
7116     Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
7117     break;
7118   case AArch64::FSUBv4f16:
7119     Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
7120              Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
7121 
7122     Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
7123              Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
7124     break;
7125   case AArch64::FSUBv8f16:
7126     Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
7127              Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
7128 
7129     Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
7130              Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
7131     break;
7132   case AArch64::FSUBv2f32:
7133     Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
7134              Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
7135 
7136     Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
7137              Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
7138     break;
7139   case AArch64::FSUBv2f64:
7140     Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
7141              Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
7142 
7143     Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
7144              Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
7145     break;
7146   case AArch64::FSUBv4f32:
7147     Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
7148              Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
7149 
7150     Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
7151              Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
7152     break;
7153   }
7154   return Found;
7155 }
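//
// Illustrative example for the scalar FMULADDS_OP1 pattern above:
//   FMULSrr I = A, B
//   FADDSrr R = I, C
//   ==> FMADDSrrr R = A, B, C
// The rewrite itself is performed later by genFusedMultiply; this function
// only records the candidate patterns.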
7156 
7157 static bool getFMULPatterns(MachineInstr &Root,
7158                             SmallVectorImpl<unsigned> &Patterns) {
7159   MachineBasicBlock &MBB = *Root.getParent();
7160   bool Found = false;
7161 
7162   auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
7163     MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7164     MachineOperand &MO = Root.getOperand(Operand);
7165     MachineInstr *MI = nullptr;
7166     if (MO.isReg() && MO.getReg().isVirtual())
7167       MI = MRI.getUniqueVRegDef(MO.getReg());
7168     // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
7169     if (MI && MI->getOpcode() == TargetOpcode::COPY &&
7170         MI->getOperand(1).getReg().isVirtual())
7171       MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());
7172     if (MI && MI->getOpcode() == Opcode) {
7173       Patterns.push_back(Pattern);
7174       return true;
7175     }
7176     return false;
7177   };
7178 
7179   typedef AArch64MachineCombinerPattern MCP;
7180 
7181   switch (Root.getOpcode()) {
7182   default:
7183     return false;
7184   case AArch64::FMULv2f32:
7185     Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
7186     Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
7187     break;
7188   case AArch64::FMULv2f64:
7189     Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
7190     Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
7191     break;
7192   case AArch64::FMULv4f16:
7193     Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
7194     Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
7195     break;
7196   case AArch64::FMULv4f32:
7197     Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
7198     Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
7199     break;
7200   case AArch64::FMULv8f16:
7201     Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
7202     Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
7203     break;
7204   }
7205 
7206   return Found;
7207 }
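//
// Illustrative example for FMULv2i32_indexed_OP2:
//   DUPv2i32lane D = V, lane
//   FMULv2f32    R = A, D
//   ==> FMULv2i32_indexed R = A, V, lane
// The fold is performed later by genIndexedMultiply.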
7208 
7209 static bool getFNEGPatterns(MachineInstr &Root,
7210                             SmallVectorImpl<unsigned> &Patterns) {
7211   unsigned Opc = Root.getOpcode();
7212   MachineBasicBlock &MBB = *Root.getParent();
7213   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7214 
7215   auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
7216     MachineOperand &MO = Root.getOperand(1);
7217     MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg());
7218     if (MI != nullptr && (MI->getOpcode() == Opcode) &&
7219         MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&
7220         Root.getFlag(MachineInstr::MIFlag::FmContract) &&
7221         Root.getFlag(MachineInstr::MIFlag::FmNsz) &&
7222         MI->getFlag(MachineInstr::MIFlag::FmContract) &&
7223         MI->getFlag(MachineInstr::MIFlag::FmNsz)) {
7224       Patterns.push_back(Pattern);
7225       return true;
7226     }
7227     return false;
7228   };
7229 
7230   switch (Opc) {
7231   default:
7232     break;
7233   case AArch64::FNEGDr:
7234     return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
7235   case AArch64::FNEGSr:
7236     return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
7237   }
7238 
7239   return false;
7240 }
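//
// Illustrative example for the FNMADD pattern, which applies only when both
// instructions carry the contract and nsz fast-math flags and the FMADD
// result has a single non-debug use:
//   FMADDDrrr I = A, B, C
//   FNEGDr    R = I
//   ==> FNMADDDrrr R = A, B, C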
7241 
7242 /// Return true when a code sequence can improve throughput. It
7243 /// should be called only for instructions in loops.
7244 /// \param Pattern - combiner pattern
7245 bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
7246   switch (Pattern) {
7247   default:
7248     break;
7249   case AArch64MachineCombinerPattern::FMULADDH_OP1:
7250   case AArch64MachineCombinerPattern::FMULADDH_OP2:
7251   case AArch64MachineCombinerPattern::FMULSUBH_OP1:
7252   case AArch64MachineCombinerPattern::FMULSUBH_OP2:
7253   case AArch64MachineCombinerPattern::FMULADDS_OP1:
7254   case AArch64MachineCombinerPattern::FMULADDS_OP2:
7255   case AArch64MachineCombinerPattern::FMULSUBS_OP1:
7256   case AArch64MachineCombinerPattern::FMULSUBS_OP2:
7257   case AArch64MachineCombinerPattern::FMULADDD_OP1:
7258   case AArch64MachineCombinerPattern::FMULADDD_OP2:
7259   case AArch64MachineCombinerPattern::FMULSUBD_OP1:
7260   case AArch64MachineCombinerPattern::FMULSUBD_OP2:
7261   case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
7262   case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
7263   case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
7264   case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
7265   case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
7266   case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
7267   case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
7268   case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
7269   case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
7270   case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
7271   case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
7272   case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
7273   case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
7274   case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
7275   case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
7276   case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
7277   case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
7278   case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
7279   case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
7280   case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
7281   case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
7282   case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
7283   case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
7284   case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
7285   case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
7286   case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
7287   case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
7288   case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1:
7289   case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
7290   case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1:
7291   case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
7292   case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
7293   case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
7294   case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
7295   case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
7296   case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
7297   case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
7298   case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
7299   case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
7300   case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
7301   case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
7302   case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
7303   case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
7304   case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
7305   case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2:
7306   case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
7307   case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2:
7308   case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
7309   case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2:
7310   case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
7311   case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2:
7312   case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
7313   case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2:
7314   case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
7315   case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
7316   case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
7317   case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
7318   case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
7319   case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
7320   case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
7321   case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
7322   case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
7323   case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
7324   case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
7325   case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
7326   case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
7327   case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
7328   case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
7329   case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
7330   case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
7331   case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
7332   case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
7333   case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
7334   case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
7335   case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
7336   case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
7337   case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
7338   case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
7339   case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
7340   case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
7341   case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
7342   case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
7343   case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
7344   case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
7345   case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
7346   case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
7347   case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
7348   case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
7349   case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
7350   case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
7351   case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
7352   case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
7353   case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
7354     return true;
7355   } // end switch (Pattern)
7356   return false;
7357 }
7358 
7359 /// Find other MI combine patterns.
7360 static bool getMiscPatterns(MachineInstr &Root,
7361                             SmallVectorImpl<unsigned> &Patterns) {
7362   // A - (B + C)  ==>   (A - B) - C  or  (A - C) - B
7363   unsigned Opc = Root.getOpcode();
7364   MachineBasicBlock &MBB = *Root.getParent();
7365 
7366   switch (Opc) {
7367   case AArch64::SUBWrr:
7368   case AArch64::SUBSWrr:
7369   case AArch64::SUBXrr:
7370   case AArch64::SUBSXrr:
7371     // Found candidate root.
7372     break;
7373   default:
7374     return false;
7375   }
7376 
7377   if (isCombineInstrSettingFlag(Opc) &&
7378       Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) ==
7379           -1)
7380     return false;
7381 
7382   if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
7383       canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
7384       canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
7385       canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
7386     Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP1);
7387     Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP2);
7388     return true;
7389   }
7390 
7391   return false;
7392 }
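//
// Illustrative example:
//   ADDWrr I = B, C
//   SUBWrr R = A, I        (R = A - (B + C))
// Both SUBADD_OP1 and SUBADD_OP2 are pushed, and the machine combiner keeps
// whichever of (A - B) - C and (A - C) - B reduces the critical-path depth
// (see getCombinerObjective below).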
7393 
7394 CombinerObjective
7395 AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
7396   switch (Pattern) {
7397   case AArch64MachineCombinerPattern::SUBADD_OP1:
7398   case AArch64MachineCombinerPattern::SUBADD_OP2:
7399     return CombinerObjective::MustReduceDepth;
7400   default:
7401     return TargetInstrInfo::getCombinerObjective(Pattern);
7402   }
7403 }
7404 
7405 /// Return true when there is potentially a faster code sequence for an
7406 /// instruction chain ending in \p Root. All potential patterns are listed in
7407 /// the \p Patterns vector. Patterns should be sorted in priority order since the
7408 /// pattern evaluator stops checking as soon as it finds a faster sequence.
7409 
7410 bool AArch64InstrInfo::getMachineCombinerPatterns(
7411     MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
7412     bool DoRegPressureReduce) const {
7413   // Integer patterns
7414   if (getMaddPatterns(Root, Patterns))
7415     return true;
7416   // Floating point patterns
7417   if (getFMULPatterns(Root, Patterns))
7418     return true;
7419   if (getFMAPatterns(Root, Patterns))
7420     return true;
7421   if (getFNEGPatterns(Root, Patterns))
7422     return true;
7423 
7424   // Other patterns
7425   if (getMiscPatterns(Root, Patterns))
7426     return true;
7427 
7428   return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
7429                                                      DoRegPressureReduce);
7430 }
7431 
7432 enum class FMAInstKind { Default, Indexed, Accumulator };
7433 /// genFusedMultiply - Generate fused multiply instructions.
7434 /// This function supports both integer and floating point instructions.
7435 /// A typical example:
7436 ///  F|MUL I=A,B,0
7437 ///  F|ADD R,I,C
7438 ///  ==> F|MADD R,A,B,C
7439 /// \param MF Containing MachineFunction
7440 /// \param MRI Register information
7441 /// \param TII Target information
7442 /// \param Root is the F|ADD instruction
7443 /// \param [out] InsInstrs is a vector of machine instructions and will
7444 /// contain the generated madd instruction
7445 /// \param IdxMulOpd is index of operand in Root that is the result of
7446 /// the F|MUL. In the example above IdxMulOpd is 1.
7447 /// \param MaddOpc the opcode of the f|madd instruction
7448 /// \param RC Register class of operands
7449 /// \param kind the kind of FMA instruction (addressing mode) to be generated
7450 /// \param ReplacedAddend is the result register from the instruction
7451 /// replacing the non-combined operand, if any.
7452 static MachineInstr *
7453 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
7454                  const TargetInstrInfo *TII, MachineInstr &Root,
7455                  SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
7456                  unsigned MaddOpc, const TargetRegisterClass *RC,
7457                  FMAInstKind kind = FMAInstKind::Default,
7458                  const Register *ReplacedAddend = nullptr) {
7459   assert(IdxMulOpd == 1 || IdxMulOpd == 2);
7460 
7461   unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
7462   MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
7463   Register ResultReg = Root.getOperand(0).getReg();
7464   Register SrcReg0 = MUL->getOperand(1).getReg();
7465   bool Src0IsKill = MUL->getOperand(1).isKill();
7466   Register SrcReg1 = MUL->getOperand(2).getReg();
7467   bool Src1IsKill = MUL->getOperand(2).isKill();
7468 
7469   Register SrcReg2;
7470   bool Src2IsKill;
7471   if (ReplacedAddend) {
7472     // If we just generated a new addend, this must be its only use.
7473     SrcReg2 = *ReplacedAddend;
7474     Src2IsKill = true;
7475   } else {
7476     SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
7477     Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
7478   }
7479 
7480   if (ResultReg.isVirtual())
7481     MRI.constrainRegClass(ResultReg, RC);
7482   if (SrcReg0.isVirtual())
7483     MRI.constrainRegClass(SrcReg0, RC);
7484   if (SrcReg1.isVirtual())
7485     MRI.constrainRegClass(SrcReg1, RC);
7486   if (SrcReg2.isVirtual())
7487     MRI.constrainRegClass(SrcReg2, RC);
7488 
7489   MachineInstrBuilder MIB;
7490   if (kind == FMAInstKind::Default)
7491     MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
7492               .addReg(SrcReg0, getKillRegState(Src0IsKill))
7493               .addReg(SrcReg1, getKillRegState(Src1IsKill))
7494               .addReg(SrcReg2, getKillRegState(Src2IsKill));
7495   else if (kind == FMAInstKind::Indexed)
7496     MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
7497               .addReg(SrcReg2, getKillRegState(Src2IsKill))
7498               .addReg(SrcReg0, getKillRegState(Src0IsKill))
7499               .addReg(SrcReg1, getKillRegState(Src1IsKill))
7500               .addImm(MUL->getOperand(3).getImm());
7501   else if (kind == FMAInstKind::Accumulator)
7502     MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
7503               .addReg(SrcReg2, getKillRegState(Src2IsKill))
7504               .addReg(SrcReg0, getKillRegState(Src0IsKill))
7505               .addReg(SrcReg1, getKillRegState(Src1IsKill));
7506   else
7507     assert(false && "Invalid FMA instruction kind \n");
7508   // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
7509   InsInstrs.push_back(MIB);
7510   return MUL;
7511 }
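//
// Illustrative example for FMULADDD_OP1 with the Default kind:
//   FMULDrr I = A, B
//   FADDDrr R = I, C
//   ==> FMADDDrrr R = A, B, C
// For the Indexed and Accumulator kinds the accumulator operand (SrcReg2) is
// emitted first instead, matching the FMLA/MLA operand order.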
7512 
7513 static MachineInstr *
7514 genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI,
7515                const TargetInstrInfo *TII, MachineInstr &Root,
7516                SmallVectorImpl<MachineInstr *> &InsInstrs) {
7517   MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
7518 
7519   unsigned Opc = 0;
7520   const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());
7521   if (AArch64::FPR32RegClass.hasSubClassEq(RC))
7522     Opc = AArch64::FNMADDSrrr;
7523   else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
7524     Opc = AArch64::FNMADDDrrr;
7525   else
7526     return nullptr;
7527 
7528   Register ResultReg = Root.getOperand(0).getReg();
7529   Register SrcReg0 = MAD->getOperand(1).getReg();
7530   Register SrcReg1 = MAD->getOperand(2).getReg();
7531   Register SrcReg2 = MAD->getOperand(3).getReg();
7532   bool Src0IsKill = MAD->getOperand(1).isKill();
7533   bool Src1IsKill = MAD->getOperand(2).isKill();
7534   bool Src2IsKill = MAD->getOperand(3).isKill();
7535   if (ResultReg.isVirtual())
7536     MRI.constrainRegClass(ResultReg, RC);
7537   if (SrcReg0.isVirtual())
7538     MRI.constrainRegClass(SrcReg0, RC);
7539   if (SrcReg1.isVirtual())
7540     MRI.constrainRegClass(SrcReg1, RC);
7541   if (SrcReg2.isVirtual())
7542     MRI.constrainRegClass(SrcReg2, RC);
7543 
7544   MachineInstrBuilder MIB =
7545       BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)
7546           .addReg(SrcReg0, getKillRegState(Src0IsKill))
7547           .addReg(SrcReg1, getKillRegState(Src1IsKill))
7548           .addReg(SrcReg2, getKillRegState(Src2IsKill));
7549   InsInstrs.push_back(MIB);
7550 
7551   return MAD;
7552 }
7553 
7554 /// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
7555 static MachineInstr *
7556 genIndexedMultiply(MachineInstr &Root,
7557                    SmallVectorImpl<MachineInstr *> &InsInstrs,
7558                    unsigned IdxDupOp, unsigned MulOpc,
7559                    const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
7560   assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
7561          "Invalid index of FMUL operand");
7562 
7563   MachineFunction &MF = *Root.getMF();
7564   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
7565 
7566   MachineInstr *Dup =
7567       MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());
7568 
7569   if (Dup->getOpcode() == TargetOpcode::COPY)
7570     Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg());
7571 
7572   Register DupSrcReg = Dup->getOperand(1).getReg();
7573   MRI.clearKillFlags(DupSrcReg);
7574   MRI.constrainRegClass(DupSrcReg, RC);
7575 
7576   unsigned DupSrcLane = Dup->getOperand(2).getImm();
7577 
7578   unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
7579   MachineOperand &MulOp = Root.getOperand(IdxMulOp);
7580 
7581   Register ResultReg = Root.getOperand(0).getReg();
7582 
7583   MachineInstrBuilder MIB;
7584   MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg)
7585             .add(MulOp)
7586             .addReg(DupSrcReg)
7587             .addImm(DupSrcLane);
7588 
7589   InsInstrs.push_back(MIB);
7590   return &Root;
7591 }
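//
// Illustrative example for IdxDupOp == 1:
//   DUPv2i32lane D = V, lane
//   FMULv2f32    R = D, A
//   ==> FMULv2i32_indexed R = A, V, lane
// The kill flags on V are cleared because V gains a new use at the rewritten
// multiply and may remain live elsewhere.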
7592 
7593 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
7594 /// instructions.
7595 ///
7596 /// \see genFusedMultiply
7597 static MachineInstr *genFusedMultiplyAcc(
7598     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
7599     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
7600     unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
7601   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
7602                           FMAInstKind::Accumulator);
7603 }
7604 
7605 /// genNeg - Helper to generate an intermediate negation of the second operand
7606 /// of Root
7607 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
7608                        const TargetInstrInfo *TII, MachineInstr &Root,
7609                        SmallVectorImpl<MachineInstr *> &InsInstrs,
7610                        DenseMap<Register, unsigned> &InstrIdxForVirtReg,
7611                        unsigned MnegOpc, const TargetRegisterClass *RC) {
7612   Register NewVR = MRI.createVirtualRegister(RC);
7613   MachineInstrBuilder MIB =
7614       BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR)
7615           .add(Root.getOperand(2));
7616   InsInstrs.push_back(MIB);
7617 
7618   assert(InstrIdxForVirtReg.empty());
7619   InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7620 
7621   return NewVR;
7622 }
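//
// Illustrative example for MULSUBv2i32_OP1 (R = A * B - C):
//   NEGv2i32 V = C
// where C is the second operand of Root. The new virtual register is recorded
// in InstrIdxForVirtReg so the combiner can account for the extra instruction;
// the MLA built afterwards accumulates into V, giving -C + A * B.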
7623 
7624 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
7625 /// instructions with an additional negation of the accumulator
7626 static MachineInstr *genFusedMultiplyAccNeg(
7627     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
7628     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
7629     DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
7630     unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
7631   assert(IdxMulOpd == 1);
7632 
7633   Register NewVR =
7634       genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
7635   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
7636                           FMAInstKind::Accumulator, &NewVR);
7637 }
7638 
7639 /// genFusedMultiplyIdx - Helper to generate fused multiply indexed
7640 /// instructions.
7641 ///
7642 /// \see genFusedMultiply
7643 static MachineInstr *genFusedMultiplyIdx(
7644     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
7645     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
7646     unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
7647   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
7648                           FMAInstKind::Indexed);
7649 }
7650 
7651 /// genFusedMultiplyIdxNeg - Helper to generate fused multiply indexed
7652 /// instructions with an additional negation of the accumulator
7653 static MachineInstr *genFusedMultiplyIdxNeg(
7654     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
7655     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
7656     DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
7657     unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
7658   assert(IdxMulOpd == 1);
7659 
7660   Register NewVR =
7661       genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
7662 
7663   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
7664                           FMAInstKind::Indexed, &NewVR);
7665 }
7666 
7667 /// genMaddR - Generate madd instruction and combine mul and add using
7668 /// an extra virtual register
7669 /// Example - an ADD intermediate needs to be stored in a register:
7670 ///   MUL I=A,B,0
7671 ///   ADD R,I,Imm
7672 ///   ==> ORR  V, ZR, Imm
7673 ///   ==> MADD R,A,B,V
7674 /// \param MF Containing MachineFunction
7675 /// \param MRI Register information
7676 /// \param TII Target information
7677 /// \param Root is the ADD instruction
7678 /// \param [out] InsInstrs is a vector of machine instructions and will
7679 /// contain the generated madd instruction
7680 /// \param IdxMulOpd is index of operand in Root that is the result of
7681 /// the MUL. In the example above IdxMulOpd is 1.
7682 /// \param MaddOpc the opcode of the madd instruction
7683 /// \param VR is a virtual register that holds the value of an ADD operand
7684 /// (V in the example above).
7685 /// \param RC Register class of operands
7686 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
7687                               const TargetInstrInfo *TII, MachineInstr &Root,
7688                               SmallVectorImpl<MachineInstr *> &InsInstrs,
7689                               unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
7690                               const TargetRegisterClass *RC) {
7691   assert(IdxMulOpd == 1 || IdxMulOpd == 2);
7692 
7693   MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
7694   Register ResultReg = Root.getOperand(0).getReg();
7695   Register SrcReg0 = MUL->getOperand(1).getReg();
7696   bool Src0IsKill = MUL->getOperand(1).isKill();
7697   Register SrcReg1 = MUL->getOperand(2).getReg();
7698   bool Src1IsKill = MUL->getOperand(2).isKill();
7699 
7700   if (ResultReg.isVirtual())
7701     MRI.constrainRegClass(ResultReg, RC);
7702   if (SrcReg0.isVirtual())
7703     MRI.constrainRegClass(SrcReg0, RC);
7704   if (SrcReg1.isVirtual())
7705     MRI.constrainRegClass(SrcReg1, RC);
7706   if (Register::isVirtualRegister(VR))
7707     MRI.constrainRegClass(VR, RC);
7708 
7709   MachineInstrBuilder MIB =
7710       BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
7711           .addReg(SrcReg0, getKillRegState(Src0IsKill))
7712           .addReg(SrcReg1, getKillRegState(Src1IsKill))
7713           .addReg(VR);
7714   // Insert the MADD
7715   InsInstrs.push_back(MIB);
7716   return MUL;
7717 }
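//
// The instruction materializing V is built by the caller: a MOVi32imm /
// MOVi64imm for the MULADDWI_OP1 / MULSUBWI_OP1 patterns, or a SUB from the
// zero register for MULSUBW_OP1 / MULSUBX_OP1 (see genAlternativeCodeSequence
// below).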
7718 
7719 /// Do the following transformation
7720 /// A - (B + C)  ==>   (A - B) - C
7721 /// A - (B + C)  ==>   (A - C) - B
7722 static void genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
7723                              const TargetInstrInfo *TII, MachineInstr &Root,
7724                              SmallVectorImpl<MachineInstr *> &InsInstrs,
7725                              SmallVectorImpl<MachineInstr *> &DelInstrs,
7726                              unsigned IdxOpd1,
7727                              DenseMap<Register, unsigned> &InstrIdxForVirtReg) {
7728   assert(IdxOpd1 == 1 || IdxOpd1 == 2);
7729   unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
7730   MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
7731 
7732   Register ResultReg = Root.getOperand(0).getReg();
7733   Register RegA = Root.getOperand(1).getReg();
7734   bool RegAIsKill = Root.getOperand(1).isKill();
7735   Register RegB = AddMI->getOperand(IdxOpd1).getReg();
7736   bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
7737   Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
7738   bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
7739   Register NewVR =
7740       MRI.createVirtualRegister(MRI.getRegClass(Root.getOperand(2).getReg()));
7741 
7742   unsigned Opcode = Root.getOpcode();
7743   if (Opcode == AArch64::SUBSWrr)
7744     Opcode = AArch64::SUBWrr;
7745   else if (Opcode == AArch64::SUBSXrr)
7746     Opcode = AArch64::SUBXrr;
7747   else
7748     assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
7749            "Unexpected instruction opcode.");
7750 
7751   uint32_t Flags = Root.mergeFlagsWith(*AddMI);
7752   Flags &= ~MachineInstr::NoSWrap;
7753   Flags &= ~MachineInstr::NoUWrap;
7754 
7755   MachineInstrBuilder MIB1 =
7756       BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)
7757           .addReg(RegA, getKillRegState(RegAIsKill))
7758           .addReg(RegB, getKillRegState(RegBIsKill))
7759           .setMIFlags(Flags);
7760   MachineInstrBuilder MIB2 =
7761       BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)
7762           .addReg(NewVR, getKillRegState(true))
7763           .addReg(RegC, getKillRegState(RegCIsKill))
7764           .setMIFlags(Flags);
7765 
7766   InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7767   InsInstrs.push_back(MIB1);
7768   InsInstrs.push_back(MIB2);
7769   DelInstrs.push_back(AddMI);
7770   DelInstrs.push_back(&Root);
7771 }
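//
// Note that the NoSWrap/NoUWrap flags are cleared on both new subtractions:
// reassociating A - (B + C) can change where intermediate overflow occurs, so
// the wrap flags of the original instructions cannot be safely carried over.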
7772 
7773 unsigned AArch64InstrInfo::getReduceOpcodeForAccumulator(
7774     unsigned int AccumulatorOpCode) const {
7775   switch (AccumulatorOpCode) {
7776   case AArch64::UABALB_ZZZ_D:
7777   case AArch64::SABALB_ZZZ_D:
7778   case AArch64::UABALT_ZZZ_D:
7779   case AArch64::SABALT_ZZZ_D:
7780     return AArch64::ADD_ZZZ_D;
7781   case AArch64::UABALB_ZZZ_H:
7782   case AArch64::SABALB_ZZZ_H:
7783   case AArch64::UABALT_ZZZ_H:
7784   case AArch64::SABALT_ZZZ_H:
7785     return AArch64::ADD_ZZZ_H;
7786   case AArch64::UABALB_ZZZ_S:
7787   case AArch64::SABALB_ZZZ_S:
7788   case AArch64::UABALT_ZZZ_S:
7789   case AArch64::SABALT_ZZZ_S:
7790     return AArch64::ADD_ZZZ_S;
7791   case AArch64::UABALv16i8_v8i16:
7792   case AArch64::SABALv8i8_v8i16:
7793   case AArch64::SABAv8i16:
7794   case AArch64::UABAv8i16:
7795     return AArch64::ADDv8i16;
7796   case AArch64::SABALv2i32_v2i64:
7797   case AArch64::UABALv2i32_v2i64:
7798   case AArch64::SABALv4i32_v2i64:
7799     return AArch64::ADDv2i64;
7800   case AArch64::UABALv4i16_v4i32:
7801   case AArch64::SABALv4i16_v4i32:
7802   case AArch64::SABALv8i16_v4i32:
7803   case AArch64::SABAv4i32:
7804   case AArch64::UABAv4i32:
7805     return AArch64::ADDv4i32;
7806   case AArch64::UABALv4i32_v2i64:
7807     return AArch64::ADDv2i64;
7808   case AArch64::UABALv8i16_v4i32:
7809     return AArch64::ADDv4i32;
7810   case AArch64::UABALv8i8_v8i16:
7811   case AArch64::SABALv16i8_v8i16:
7812     return AArch64::ADDv8i16;
7813   case AArch64::UABAv16i8:
7814   case AArch64::SABAv16i8:
7815     return AArch64::ADDv16i8;
7816   case AArch64::UABAv4i16:
7817   case AArch64::SABAv4i16:
7818     return AArch64::ADDv4i16;
7819   case AArch64::UABAv2i32:
7820   case AArch64::SABAv2i32:
7821     return AArch64::ADDv2i32;
7822   case AArch64::UABAv8i8:
7823   case AArch64::SABAv8i8:
7824     return AArch64::ADDv8i8;
7825   default:
7826     llvm_unreachable("Unknown accumulator opcode");
7827   }
7828 }
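//
// These add opcodes are used by the machine combiner when it splits a long
// accumulation chain (e.g. of UABALv4i16_v4i32 instructions) into parallel
// sub-chains: the partial accumulators are then summed back together with the
// plain vector ADD returned here.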
7829 
7830 /// When getMachineCombinerPatterns() finds potential patterns,
7831 /// this function generates the instructions that could replace the
7832 /// original code sequence.
7833 void AArch64InstrInfo::genAlternativeCodeSequence(
7834     MachineInstr &Root, unsigned Pattern,
7835     SmallVectorImpl<MachineInstr *> &InsInstrs,
7836     SmallVectorImpl<MachineInstr *> &DelInstrs,
7837     DenseMap<Register, unsigned> &InstrIdxForVirtReg) const {
7838   MachineBasicBlock &MBB = *Root.getParent();
7839   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7840   MachineFunction &MF = *MBB.getParent();
7841   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
7842 
7843   MachineInstr *MUL = nullptr;
7844   const TargetRegisterClass *RC;
7845   unsigned Opc;
7846   switch (Pattern) {
7847   default:
7848     // Reassociate instructions.
7849     TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
7850                                                 DelInstrs, InstrIdxForVirtReg);
7851     return;
7852   case AArch64MachineCombinerPattern::SUBADD_OP1:
7853     // A - (B + C)
7854     // ==> (A - B) - C
7855     genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
7856                      InstrIdxForVirtReg);
7857     return;
7858   case AArch64MachineCombinerPattern::SUBADD_OP2:
7859     // A - (B + C)
7860     // ==> (A - C) - B
7861     genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
7862                      InstrIdxForVirtReg);
7863     return;
7864   case AArch64MachineCombinerPattern::MULADDW_OP1:
7865   case AArch64MachineCombinerPattern::MULADDX_OP1:
7866     // MUL I=A,B,0
7867     // ADD R,I,C
7868     // ==> MADD R,A,B,C
7869     // --- Create(MADD);
7870     if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP1) {
7871       Opc = AArch64::MADDWrrr;
7872       RC = &AArch64::GPR32RegClass;
7873     } else {
7874       Opc = AArch64::MADDXrrr;
7875       RC = &AArch64::GPR64RegClass;
7876     }
7877     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7878     break;
7879   case AArch64MachineCombinerPattern::MULADDW_OP2:
7880   case AArch64MachineCombinerPattern::MULADDX_OP2:
7881     // MUL I=A,B,0
7882     // ADD R,C,I
7883     // ==> MADD R,A,B,C
7884     // --- Create(MADD);
7885     if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP2) {
7886       Opc = AArch64::MADDWrrr;
7887       RC = &AArch64::GPR32RegClass;
7888     } else {
7889       Opc = AArch64::MADDXrrr;
7890       RC = &AArch64::GPR64RegClass;
7891     }
7892     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7893     break;
7894   case AArch64MachineCombinerPattern::MULADDWI_OP1:
7895   case AArch64MachineCombinerPattern::MULADDXI_OP1:
7896   case AArch64MachineCombinerPattern::MULSUBWI_OP1:
7897   case AArch64MachineCombinerPattern::MULSUBXI_OP1: {
7898     // MUL I=A,B,0
7899     // ADD/SUB R,I,Imm
7900     // ==> MOV V, Imm/-Imm
7901     // ==> MADD R,A,B,V
7902     // --- Create(MADD);
7903     const TargetRegisterClass *RC;
7904     unsigned BitSize, MovImm;
7905     if (Pattern == AArch64MachineCombinerPattern::MULADDWI_OP1 ||
7906         Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1) {
7907       MovImm = AArch64::MOVi32imm;
7908       RC = &AArch64::GPR32spRegClass;
7909       BitSize = 32;
7910       Opc = AArch64::MADDWrrr;
7911       RC = &AArch64::GPR32RegClass;
7912     } else {
7913       MovImm = AArch64::MOVi64imm;
7914       RC = &AArch64::GPR64spRegClass;
7915       BitSize = 64;
7916       Opc = AArch64::MADDXrrr;
7917       RC = &AArch64::GPR64RegClass;
7918     }
7919     Register NewVR = MRI.createVirtualRegister(RC);
7920     uint64_t Imm = Root.getOperand(2).getImm();
7921 
7922     if (Root.getOperand(3).isImm()) {
7923       unsigned Val = Root.getOperand(3).getImm();
7924       Imm = Imm << Val;
7925     }
7926     bool IsSub = Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1 ||
7927                  Pattern == AArch64MachineCombinerPattern::MULSUBXI_OP1;
7928     uint64_t UImm = SignExtend64(IsSub ? -Imm : Imm, BitSize);
7929     // Check that the immediate can be composed via a single instruction.
7930     SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
7931     AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
7932     if (Insn.size() != 1)
7933       return;
7934     MachineInstrBuilder MIB1 =
7935         BuildMI(MF, MIMetadata(Root), TII->get(MovImm), NewVR)
7936             .addImm(IsSub ? -Imm : Imm);
7937     InsInstrs.push_back(MIB1);
7938     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7939     MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7940     break;
7941   }
7942   case AArch64MachineCombinerPattern::MULSUBW_OP1:
7943   case AArch64MachineCombinerPattern::MULSUBX_OP1: {
7944     // MUL I=A,B,0
7945     // SUB R,I, C
7946     // ==> SUB  V, 0, C
7947     // ==> MADD R,A,B,V // = -C + A*B
7948     // --- Create(MADD);
7949     const TargetRegisterClass *SubRC;
7950     unsigned SubOpc, ZeroReg;
7951     if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP1) {
7952       SubOpc = AArch64::SUBWrr;
7953       SubRC = &AArch64::GPR32spRegClass;
7954       ZeroReg = AArch64::WZR;
7955       Opc = AArch64::MADDWrrr;
7956       RC = &AArch64::GPR32RegClass;
7957     } else {
7958       SubOpc = AArch64::SUBXrr;
7959       SubRC = &AArch64::GPR64spRegClass;
7960       ZeroReg = AArch64::XZR;
7961       Opc = AArch64::MADDXrrr;
7962       RC = &AArch64::GPR64RegClass;
7963     }
7964     Register NewVR = MRI.createVirtualRegister(SubRC);
7965     // SUB NewVR, 0, C
7966     MachineInstrBuilder MIB1 =
7967         BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR)
7968             .addReg(ZeroReg)
7969             .add(Root.getOperand(2));
7970     InsInstrs.push_back(MIB1);
7971     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7972     MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7973     break;
7974   }
7975   case AArch64MachineCombinerPattern::MULSUBW_OP2:
7976   case AArch64MachineCombinerPattern::MULSUBX_OP2:
7977     // MUL I=A,B,0
7978     // SUB R,C,I
7979     // ==> MSUB R,A,B,C (computes C - A*B)
7980     // --- Create(MSUB);
7981     if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP2) {
7982       Opc = AArch64::MSUBWrrr;
7983       RC = &AArch64::GPR32RegClass;
7984     } else {
7985       Opc = AArch64::MSUBXrrr;
7986       RC = &AArch64::GPR64RegClass;
7987     }
7988     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7989     break;
7990   case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
7991     Opc = AArch64::MLAv8i8;
7992     RC = &AArch64::FPR64RegClass;
7993     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7994     break;
7995   case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
7996     Opc = AArch64::MLAv8i8;
7997     RC = &AArch64::FPR64RegClass;
7998     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7999     break;
8000   case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
8001     Opc = AArch64::MLAv16i8;
8002     RC = &AArch64::FPR128RegClass;
8003     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8004     break;
8005   case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
8006     Opc = AArch64::MLAv16i8;
8007     RC = &AArch64::FPR128RegClass;
8008     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8009     break;
8010   case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
8011     Opc = AArch64::MLAv4i16;
8012     RC = &AArch64::FPR64RegClass;
8013     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8014     break;
8015   case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
8016     Opc = AArch64::MLAv4i16;
8017     RC = &AArch64::FPR64RegClass;
8018     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8019     break;
8020   case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
8021     Opc = AArch64::MLAv8i16;
8022     RC = &AArch64::FPR128RegClass;
8023     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8024     break;
8025   case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
8026     Opc = AArch64::MLAv8i16;
8027     RC = &AArch64::FPR128RegClass;
8028     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8029     break;
8030   case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
8031     Opc = AArch64::MLAv2i32;
8032     RC = &AArch64::FPR64RegClass;
8033     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8034     break;
8035   case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
8036     Opc = AArch64::MLAv2i32;
8037     RC = &AArch64::FPR64RegClass;
8038     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8039     break;
8040   case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
8041     Opc = AArch64::MLAv4i32;
8042     RC = &AArch64::FPR128RegClass;
8043     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8044     break;
8045   case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
8046     Opc = AArch64::MLAv4i32;
8047     RC = &AArch64::FPR128RegClass;
8048     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8049     break;
8050 
8051   case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
8052     Opc = AArch64::MLAv8i8;
8053     RC = &AArch64::FPR64RegClass;
8054     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8055                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
8056                                  RC);
8057     break;
8058   case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
8059     Opc = AArch64::MLSv8i8;
8060     RC = &AArch64::FPR64RegClass;
8061     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8062     break;
8063   case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
8064     Opc = AArch64::MLAv16i8;
8065     RC = &AArch64::FPR128RegClass;
8066     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8067                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
8068                                  RC);
8069     break;
8070   case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
8071     Opc = AArch64::MLSv16i8;
8072     RC = &AArch64::FPR128RegClass;
8073     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8074     break;
8075   case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
8076     Opc = AArch64::MLAv4i16;
8077     RC = &AArch64::FPR64RegClass;
8078     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8079                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
8080                                  RC);
8081     break;
8082   case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
8083     Opc = AArch64::MLSv4i16;
8084     RC = &AArch64::FPR64RegClass;
8085     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8086     break;
8087   case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
8088     Opc = AArch64::MLAv8i16;
8089     RC = &AArch64::FPR128RegClass;
8090     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8091                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
8092                                  RC);
8093     break;
8094   case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
8095     Opc = AArch64::MLSv8i16;
8096     RC = &AArch64::FPR128RegClass;
8097     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8098     break;
8099   case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
8100     Opc = AArch64::MLAv2i32;
8101     RC = &AArch64::FPR64RegClass;
8102     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8103                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
8104                                  RC);
8105     break;
8106   case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
8107     Opc = AArch64::MLSv2i32;
8108     RC = &AArch64::FPR64RegClass;
8109     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8110     break;
8111   case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
8112     Opc = AArch64::MLAv4i32;
8113     RC = &AArch64::FPR128RegClass;
8114     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
8115                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
8116                                  RC);
8117     break;
8118   case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
8119     Opc = AArch64::MLSv4i32;
8120     RC = &AArch64::FPR128RegClass;
8121     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8122     break;
8123 
8124   case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
8125     Opc = AArch64::MLAv4i16_indexed;
8126     RC = &AArch64::FPR64RegClass;
8127     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8128     break;
8129   case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
8130     Opc = AArch64::MLAv4i16_indexed;
8131     RC = &AArch64::FPR64RegClass;
8132     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8133     break;
8134   case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
8135     Opc = AArch64::MLAv8i16_indexed;
8136     RC = &AArch64::FPR128RegClass;
8137     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8138     break;
8139   case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
8140     Opc = AArch64::MLAv8i16_indexed;
8141     RC = &AArch64::FPR128RegClass;
8142     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8143     break;
8144   case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
8145     Opc = AArch64::MLAv2i32_indexed;
8146     RC = &AArch64::FPR64RegClass;
8147     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8148     break;
8149   case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
8150     Opc = AArch64::MLAv2i32_indexed;
8151     RC = &AArch64::FPR64RegClass;
8152     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8153     break;
8154   case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
8155     Opc = AArch64::MLAv4i32_indexed;
8156     RC = &AArch64::FPR128RegClass;
8157     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8158     break;
8159   case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
8160     Opc = AArch64::MLAv4i32_indexed;
8161     RC = &AArch64::FPR128RegClass;
8162     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8163     break;
8164 
8165   case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
8166     Opc = AArch64::MLAv4i16_indexed;
8167     RC = &AArch64::FPR64RegClass;
8168     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
8169                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
8170                                  RC);
8171     break;
8172   case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
8173     Opc = AArch64::MLSv4i16_indexed;
8174     RC = &AArch64::FPR64RegClass;
8175     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8176     break;
8177   case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
8178     Opc = AArch64::MLAv8i16_indexed;
8179     RC = &AArch64::FPR128RegClass;
8180     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
8181                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
8182                                  RC);
8183     break;
8184   case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
8185     Opc = AArch64::MLSv8i16_indexed;
8186     RC = &AArch64::FPR128RegClass;
8187     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8188     break;
8189   case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
8190     Opc = AArch64::MLAv2i32_indexed;
8191     RC = &AArch64::FPR64RegClass;
8192     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
8193                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
8194                                  RC);
8195     break;
8196   case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
8197     Opc = AArch64::MLSv2i32_indexed;
8198     RC = &AArch64::FPR64RegClass;
8199     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8200     break;
8201   case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
8202     Opc = AArch64::MLAv4i32_indexed;
8203     RC = &AArch64::FPR128RegClass;
8204     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
8205                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
8206                                  RC);
8207     break;
8208   case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
8209     Opc = AArch64::MLSv4i32_indexed;
8210     RC = &AArch64::FPR128RegClass;
8211     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8212     break;
8213 
8214   // Floating Point Support
8215   case AArch64MachineCombinerPattern::FMULADDH_OP1:
8216     Opc = AArch64::FMADDHrrr;
8217     RC = &AArch64::FPR16RegClass;
8218     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8219     break;
8220   case AArch64MachineCombinerPattern::FMULADDS_OP1:
8221     Opc = AArch64::FMADDSrrr;
8222     RC = &AArch64::FPR32RegClass;
8223     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8224     break;
8225   case AArch64MachineCombinerPattern::FMULADDD_OP1:
8226     Opc = AArch64::FMADDDrrr;
8227     RC = &AArch64::FPR64RegClass;
8228     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8229     break;
8230 
8231   case AArch64MachineCombinerPattern::FMULADDH_OP2:
8232     Opc = AArch64::FMADDHrrr;
8233     RC = &AArch64::FPR16RegClass;
8234     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8235     break;
8236   case AArch64MachineCombinerPattern::FMULADDS_OP2:
8237     Opc = AArch64::FMADDSrrr;
8238     RC = &AArch64::FPR32RegClass;
8239     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8240     break;
8241   case AArch64MachineCombinerPattern::FMULADDD_OP2:
8242     Opc = AArch64::FMADDDrrr;
8243     RC = &AArch64::FPR64RegClass;
8244     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8245     break;
8246 
8247   case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
8248     Opc = AArch64::FMLAv1i32_indexed;
8249     RC = &AArch64::FPR32RegClass;
8250     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8251                            FMAInstKind::Indexed);
8252     break;
8253   case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
8254     Opc = AArch64::FMLAv1i32_indexed;
8255     RC = &AArch64::FPR32RegClass;
8256     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8257                            FMAInstKind::Indexed);
8258     break;
8259 
8260   case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
8261     Opc = AArch64::FMLAv1i64_indexed;
8262     RC = &AArch64::FPR64RegClass;
8263     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8264                            FMAInstKind::Indexed);
8265     break;
8266   case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
8267     Opc = AArch64::FMLAv1i64_indexed;
8268     RC = &AArch64::FPR64RegClass;
8269     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8270                            FMAInstKind::Indexed);
8271     break;
8272 
8273   case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
8274     RC = &AArch64::FPR64RegClass;
8275     Opc = AArch64::FMLAv4i16_indexed;
8276     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8277                            FMAInstKind::Indexed);
8278     break;
8279   case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
8280     RC = &AArch64::FPR64RegClass;
8281     Opc = AArch64::FMLAv4f16;
8282     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8283                            FMAInstKind::Accumulator);
8284     break;
8285   case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
8286     RC = &AArch64::FPR64RegClass;
8287     Opc = AArch64::FMLAv4i16_indexed;
8288     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8289                            FMAInstKind::Indexed);
8290     break;
8291   case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
8292     RC = &AArch64::FPR64RegClass;
8293     Opc = AArch64::FMLAv4f16;
8294     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8295                            FMAInstKind::Accumulator);
8296     break;
8297 
8298   case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
8299   case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
8300     RC = &AArch64::FPR64RegClass;
8301     if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
8302       Opc = AArch64::FMLAv2i32_indexed;
8303       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8304                              FMAInstKind::Indexed);
8305     } else {
8306       Opc = AArch64::FMLAv2f32;
8307       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8308                              FMAInstKind::Accumulator);
8309     }
8310     break;
8311   case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
8312   case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
8313     RC = &AArch64::FPR64RegClass;
8314     if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
8315       Opc = AArch64::FMLAv2i32_indexed;
8316       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8317                              FMAInstKind::Indexed);
8318     } else {
8319       Opc = AArch64::FMLAv2f32;
8320       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8321                              FMAInstKind::Accumulator);
8322     }
8323     break;
8324 
8325   case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
8326     RC = &AArch64::FPR128RegClass;
8327     Opc = AArch64::FMLAv8i16_indexed;
8328     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8329                            FMAInstKind::Indexed);
8330     break;
8331   case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
8332     RC = &AArch64::FPR128RegClass;
8333     Opc = AArch64::FMLAv8f16;
8334     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8335                            FMAInstKind::Accumulator);
8336     break;
8337   case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
8338     RC = &AArch64::FPR128RegClass;
8339     Opc = AArch64::FMLAv8i16_indexed;
8340     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8341                            FMAInstKind::Indexed);
8342     break;
8343   case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
8344     RC = &AArch64::FPR128RegClass;
8345     Opc = AArch64::FMLAv8f16;
8346     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8347                            FMAInstKind::Accumulator);
8348     break;
8349 
8350   case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
8351   case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
8352     RC = &AArch64::FPR128RegClass;
8353     if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
8354       Opc = AArch64::FMLAv2i64_indexed;
8355       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8356                              FMAInstKind::Indexed);
8357     } else {
8358       Opc = AArch64::FMLAv2f64;
8359       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8360                              FMAInstKind::Accumulator);
8361     }
8362     break;
8363   case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
8364   case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
8365     RC = &AArch64::FPR128RegClass;
8366     if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
8367       Opc = AArch64::FMLAv2i64_indexed;
8368       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8369                              FMAInstKind::Indexed);
8370     } else {
8371       Opc = AArch64::FMLAv2f64;
8372       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8373                              FMAInstKind::Accumulator);
8374     }
8375     break;
8376 
8377   case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
8378   case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
8379     RC = &AArch64::FPR128RegClass;
8380     if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
8381       Opc = AArch64::FMLAv4i32_indexed;
8382       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8383                              FMAInstKind::Indexed);
8384     } else {
8385       Opc = AArch64::FMLAv4f32;
8386       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8387                              FMAInstKind::Accumulator);
8388     }
8389     break;
8390 
8391   case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
8392   case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
8393     RC = &AArch64::FPR128RegClass;
8394     if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
8395       Opc = AArch64::FMLAv4i32_indexed;
8396       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8397                              FMAInstKind::Indexed);
8398     } else {
8399       Opc = AArch64::FMLAv4f32;
8400       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8401                              FMAInstKind::Accumulator);
8402     }
8403     break;
8404 
8405   case AArch64MachineCombinerPattern::FMULSUBH_OP1:
8406     Opc = AArch64::FNMSUBHrrr;
8407     RC = &AArch64::FPR16RegClass;
8408     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8409     break;
8410   case AArch64MachineCombinerPattern::FMULSUBS_OP1:
8411     Opc = AArch64::FNMSUBSrrr;
8412     RC = &AArch64::FPR32RegClass;
8413     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8414     break;
8415   case AArch64MachineCombinerPattern::FMULSUBD_OP1:
8416     Opc = AArch64::FNMSUBDrrr;
8417     RC = &AArch64::FPR64RegClass;
8418     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8419     break;
8420 
8421   case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
8422     Opc = AArch64::FNMADDHrrr;
8423     RC = &AArch64::FPR16RegClass;
8424     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8425     break;
8426   case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
8427     Opc = AArch64::FNMADDSrrr;
8428     RC = &AArch64::FPR32RegClass;
8429     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8430     break;
8431   case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
8432     Opc = AArch64::FNMADDDrrr;
8433     RC = &AArch64::FPR64RegClass;
8434     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
8435     break;
8436 
8437   case AArch64MachineCombinerPattern::FMULSUBH_OP2:
8438     Opc = AArch64::FMSUBHrrr;
8439     RC = &AArch64::FPR16RegClass;
8440     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8441     break;
8442   case AArch64MachineCombinerPattern::FMULSUBS_OP2:
8443     Opc = AArch64::FMSUBSrrr;
8444     RC = &AArch64::FPR32RegClass;
8445     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8446     break;
8447   case AArch64MachineCombinerPattern::FMULSUBD_OP2:
8448     Opc = AArch64::FMSUBDrrr;
8449     RC = &AArch64::FPR64RegClass;
8450     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
8451     break;
8452 
8453   case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
8454     Opc = AArch64::FMLSv1i32_indexed;
8455     RC = &AArch64::FPR32RegClass;
8456     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8457                            FMAInstKind::Indexed);
8458     break;
8459 
8460   case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
8461     Opc = AArch64::FMLSv1i64_indexed;
8462     RC = &AArch64::FPR64RegClass;
8463     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8464                            FMAInstKind::Indexed);
8465     break;
8466 
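  // The FMLS*_OP1 patterns below match fsub(fmul(b, c), a). FMLS computes
  // acc - b*c, which has the wrong sign for this shape, so instead the
  // accumulator is negated with FNEG and the multiply is folded into an
  // FMLA: fmla(fneg(a), b, c) == b*c - a.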
8467   case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
8468   case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
8469     RC = &AArch64::FPR64RegClass;
8470     Register NewVR = MRI.createVirtualRegister(RC);
8471     MachineInstrBuilder MIB1 =
8472         BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR)
8473             .add(Root.getOperand(2));
8474     InsInstrs.push_back(MIB1);
8475     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8476     if (Pattern == AArch64MachineCombinerPattern::FMLSv4f16_OP1) {
8477       Opc = AArch64::FMLAv4f16;
8478       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8479                              FMAInstKind::Accumulator, &NewVR);
8480     } else {
8481       Opc = AArch64::FMLAv4i16_indexed;
8482       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8483                              FMAInstKind::Indexed, &NewVR);
8484     }
8485     break;
8486   }
8487   case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
8488     RC = &AArch64::FPR64RegClass;
8489     Opc = AArch64::FMLSv4f16;
8490     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8491                            FMAInstKind::Accumulator);
8492     break;
8493   case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
8494     RC = &AArch64::FPR64RegClass;
8495     Opc = AArch64::FMLSv4i16_indexed;
8496     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8497                            FMAInstKind::Indexed);
8498     break;
8499 
8500   case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
8501   case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
8502     RC = &AArch64::FPR64RegClass;
8503     if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
8504       Opc = AArch64::FMLSv2i32_indexed;
8505       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8506                              FMAInstKind::Indexed);
8507     } else {
8508       Opc = AArch64::FMLSv2f32;
8509       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8510                              FMAInstKind::Accumulator);
8511     }
8512     break;
8513 
8514   case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
8515   case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
8516     RC = &AArch64::FPR128RegClass;
8517     Register NewVR = MRI.createVirtualRegister(RC);
8518     MachineInstrBuilder MIB1 =
8519         BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR)
8520             .add(Root.getOperand(2));
8521     InsInstrs.push_back(MIB1);
8522     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8523     if (Pattern == AArch64MachineCombinerPattern::FMLSv8f16_OP1) {
8524       Opc = AArch64::FMLAv8f16;
8525       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8526                              FMAInstKind::Accumulator, &NewVR);
8527     } else {
8528       Opc = AArch64::FMLAv8i16_indexed;
8529       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8530                              FMAInstKind::Indexed, &NewVR);
8531     }
8532     break;
8533   }
8534   case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
8535     RC = &AArch64::FPR128RegClass;
8536     Opc = AArch64::FMLSv8f16;
8537     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8538                            FMAInstKind::Accumulator);
8539     break;
8540   case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
8541     RC = &AArch64::FPR128RegClass;
8542     Opc = AArch64::FMLSv8i16_indexed;
8543     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8544                            FMAInstKind::Indexed);
8545     break;
8546 
8547   case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
8548   case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
8549     RC = &AArch64::FPR128RegClass;
8550     if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
8551       Opc = AArch64::FMLSv2i64_indexed;
8552       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8553                              FMAInstKind::Indexed);
8554     } else {
8555       Opc = AArch64::FMLSv2f64;
8556       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8557                              FMAInstKind::Accumulator);
8558     }
8559     break;
8560 
8561   case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
8562   case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
8563     RC = &AArch64::FPR128RegClass;
8564     if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
8565       Opc = AArch64::FMLSv4i32_indexed;
8566       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8567                              FMAInstKind::Indexed);
8568     } else {
8569       Opc = AArch64::FMLSv4f32;
8570       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
8571                              FMAInstKind::Accumulator);
8572     }
8573     break;
8574   case AArch64MachineCombinerPattern::FMLSv2f32_OP1:
8575   case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
8576     RC = &AArch64::FPR64RegClass;
8577     Register NewVR = MRI.createVirtualRegister(RC);
8578     MachineInstrBuilder MIB1 =
8579         BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR)
8580             .add(Root.getOperand(2));
8581     InsInstrs.push_back(MIB1);
8582     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8583     if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
8584       Opc = AArch64::FMLAv2i32_indexed;
8585       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8586                              FMAInstKind::Indexed, &NewVR);
8587     } else {
8588       Opc = AArch64::FMLAv2f32;
8589       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8590                              FMAInstKind::Accumulator, &NewVR);
8591     }
8592     break;
8593   }
8594   case AArch64MachineCombinerPattern::FMLSv4f32_OP1:
8595   case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
8596     RC = &AArch64::FPR128RegClass;
8597     Register NewVR = MRI.createVirtualRegister(RC);
8598     MachineInstrBuilder MIB1 =
8599         BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR)
8600             .add(Root.getOperand(2));
8601     InsInstrs.push_back(MIB1);
8602     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8603     if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
8604       Opc = AArch64::FMLAv4i32_indexed;
8605       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8606                              FMAInstKind::Indexed, &NewVR);
8607     } else {
8608       Opc = AArch64::FMLAv4f32;
8609       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8610                              FMAInstKind::Accumulator, &NewVR);
8611     }
8612     break;
8613   }
8614   case AArch64MachineCombinerPattern::FMLSv2f64_OP1:
8615   case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
8616     RC = &AArch64::FPR128RegClass;
8617     Register NewVR = MRI.createVirtualRegister(RC);
8618     MachineInstrBuilder MIB1 =
8619         BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR)
8620             .add(Root.getOperand(2));
8621     InsInstrs.push_back(MIB1);
8622     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
8623     if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
8624       Opc = AArch64::FMLAv2i64_indexed;
8625       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8626                              FMAInstKind::Indexed, &NewVR);
8627     } else {
8628       Opc = AArch64::FMLAv2f64;
8629       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
8630                              FMAInstKind::Accumulator, &NewVR);
8631     }
8632     break;
8633   }
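  // The FMUL*_indexed patterns replace a multiply whose IdxDupOp operand is
  // a DUP of a vector lane with a single lane-indexed multiply, e.g.
  //   dup  v3.2s, v1.s[0]
  //   fmul v0.2s, v2.2s, v3.2s
  // ==>
  //   fmul v0.2s, v2.2s, v1.s[0]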
8634   case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
8635   case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2: {
8636     unsigned IdxDupOp =
8637         (Pattern == AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1
8638                                                                           : 2;
8639     genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
8640                        &AArch64::FPR128RegClass, MRI);
8641     break;
8642   }
8643   case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
8644   case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2: {
8645     unsigned IdxDupOp =
8646         (Pattern == AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1
8647                                                                           : 2;
8648     genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
8649                        &AArch64::FPR128RegClass, MRI);
8650     break;
8651   }
8652   case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
8653   case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2: {
8654     unsigned IdxDupOp =
8655         (Pattern == AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1
8656                                                                           : 2;
8657     genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
8658                        &AArch64::FPR128_loRegClass, MRI);
8659     break;
8660   }
8661   case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
8662   case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2: {
8663     unsigned IdxDupOp =
8664         (Pattern == AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1
8665                                                                           : 2;
8666     genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
8667                        &AArch64::FPR128RegClass, MRI);
8668     break;
8669   }
8670   case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
8671   case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2: {
8672     unsigned IdxDupOp =
8673         (Pattern == AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1
8674                                                                           : 2;
8675     genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
8676                        &AArch64::FPR128_loRegClass, MRI);
8677     break;
8678   }
8679   case AArch64MachineCombinerPattern::FNMADD: {
8680     MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
8681     break;
8682   }
8683 
8684   } // end switch (Pattern)
8685   // Record MUL and ADD/SUB for deletion
8686   if (MUL)
8687     DelInstrs.push_back(MUL);
8688   DelInstrs.push_back(&Root);
8689 
8690   // Set the flags on the inserted instructions to be the merged flags of the
8691   // instructions that we have combined.
8692   uint32_t Flags = Root.getFlags();
8693   if (MUL)
8694     Flags = Root.mergeFlagsWith(*MUL);
8695   for (auto *MI : InsInstrs)
8696     MI->setFlags(Flags);
8697 }
8698 
8699 /// Replace a csinc-branch sequence with a simple conditional branch
8700 ///
8701 /// Examples:
8702 /// 1. \code
8703 ///   csinc  w9, wzr, wzr, <condition code>
8704 ///   tbnz   w9, #0, 0x44
8705 ///    \endcode
8706 /// to
8707 ///    \code
8708 ///   b.<inverted condition code>
8709 ///    \endcode
8710 ///
8711 /// 2. \code
8712 ///   csinc w9, wzr, wzr, <condition code>
8713 ///   tbz   w9, #0, 0x44
8714 ///    \endcode
8715 /// to
8716 ///    \code
8717 ///   b.<condition code>
8718 ///    \endcode
8719 ///
8720 /// Replace compare and branch sequence by TBZ/TBNZ instruction when the
8721 /// compare's constant operand is a power of 2.
8722 ///
8723 /// Examples:
8724 ///    \code
8725 ///   and  w8, w8, #0x400
8726 ///   cbnz w8, L1
8727 ///    \endcode
8728 /// to
8729 ///    \code
8730 ///   tbnz w8, #10, L1
8731 ///    \endcode
8732 ///
8733 /// \param  MI Conditional Branch
8734 /// \return True when the simple conditional branch is generated
8735 ///
8736 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
8737   bool IsNegativeBranch = false;
8738   bool IsTestAndBranch = false;
8739   unsigned TargetBBInMI = 0;
8740   switch (MI.getOpcode()) {
8741   default:
8742     llvm_unreachable("Unknown branch instruction?");
8743   case AArch64::Bcc:
8744   case AArch64::CBWPri:
8745   case AArch64::CBXPri:
8746   case AArch64::CBWPrr:
8747   case AArch64::CBXPrr:
8748     return false;
8749   case AArch64::CBZW:
8750   case AArch64::CBZX:
8751     TargetBBInMI = 1;
8752     break;
8753   case AArch64::CBNZW:
8754   case AArch64::CBNZX:
8755     TargetBBInMI = 1;
8756     IsNegativeBranch = true;
8757     break;
8758   case AArch64::TBZW:
8759   case AArch64::TBZX:
8760     TargetBBInMI = 2;
8761     IsTestAndBranch = true;
8762     break;
8763   case AArch64::TBNZW:
8764   case AArch64::TBNZX:
8765     TargetBBInMI = 2;
8766     IsNegativeBranch = true;
8767     IsTestAndBranch = true;
8768     break;
8769   }
8770   // So we increment a zero register and test for bits other
8771   // than bit 0? Conservatively bail out in case the verifier
8772   // missed this case.
8773   if (IsTestAndBranch && MI.getOperand(1).getImm())
8774     return false;
8775 
8776   // Find Definition.
8777   assert(MI.getParent() && "Incomplete machine instruction\n");
8778   MachineBasicBlock *MBB = MI.getParent();
8779   MachineFunction *MF = MBB->getParent();
8780   MachineRegisterInfo *MRI = &MF->getRegInfo();
8781   Register VReg = MI.getOperand(0).getReg();
8782   if (!VReg.isVirtual())
8783     return false;
8784 
8785   MachineInstr *DefMI = MRI->getVRegDef(VReg);
8786 
8787   // Look through COPY instructions to find definition.
8788   while (DefMI->isCopy()) {
8789     Register CopyVReg = DefMI->getOperand(1).getReg();
8790     if (!MRI->hasOneNonDBGUse(CopyVReg))
8791       return false;
8792     if (!MRI->hasOneDef(CopyVReg))
8793       return false;
8794     DefMI = MRI->getVRegDef(CopyVReg);
8795   }
8796 
8797   switch (DefMI->getOpcode()) {
8798   default:
8799     return false;
8800   // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
8801   case AArch64::ANDWri:
8802   case AArch64::ANDXri: {
8803     if (IsTestAndBranch)
8804       return false;
8805     if (DefMI->getParent() != MBB)
8806       return false;
8807     if (!MRI->hasOneNonDBGUse(VReg))
8808       return false;
8809 
8810     bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
8811     uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
8812         DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
8813     if (!isPowerOf2_64(Mask))
8814       return false;
8815 
8816     MachineOperand &MO = DefMI->getOperand(1);
8817     Register NewReg = MO.getReg();
8818     if (!NewReg.isVirtual())
8819       return false;
8820 
8821     assert(!MRI->def_empty(NewReg) && "Register must be defined.");
8822 
8823     MachineBasicBlock &RefToMBB = *MBB;
8824     MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
8825     DebugLoc DL = MI.getDebugLoc();
8826     unsigned Imm = Log2_64(Mask);
8827     unsigned Opc = (Imm < 32)
8828                        ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
8829                        : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
8830     MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
8831                               .addReg(NewReg)
8832                               .addImm(Imm)
8833                               .addMBB(TBB);
8834     // Register lives on to the TBZ/TBNZ now.
8835     MO.setIsKill(false);
8836 
8837     // For immediates smaller than 32, we must use the 32-bit
8838     // variant (W) in all cases: the 64-bit variant cannot
8839     // encode them.
8840     // Therefore, if the input register is 64-bit, we need to take its
8841     // 32-bit sub-register.
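    // For example (illustrative), folding "and x8, x8, #0x8" + "cbnz x8"
    // yields "tbnz w8, #3", testing bit 3 via the 32-bit sub-register of x8.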
8842     if (!Is32Bit && Imm < 32)
8843       NewMI->getOperand(0).setSubReg(AArch64::sub_32);
8844     MI.eraseFromParent();
8845     return true;
8846   }
8847   // Look for CSINC
8848   case AArch64::CSINCWr:
8849   case AArch64::CSINCXr: {
8850     if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
8851           DefMI->getOperand(2).getReg() == AArch64::WZR) &&
8852         !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
8853           DefMI->getOperand(2).getReg() == AArch64::XZR))
8854       return false;
8855 
8856     if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
8857                                          true) != -1)
8858       return false;
8859 
8860     AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
8861     // Convert only when the condition code is not modified between
8862     // the CSINC and the branch. The CC may be used by other
8863     // instructions in between.
8864     if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
8865       return false;
8866     MachineBasicBlock &RefToMBB = *MBB;
8867     MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
8868     DebugLoc DL = MI.getDebugLoc();
8869     if (IsNegativeBranch)
8870       CC = AArch64CC::getInvertedCondCode(CC);
8871     BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
8872     MI.eraseFromParent();
8873     return true;
8874   }
8875   }
8876 }
8877 
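// For example, a target flag of MO_PAGEOFF | MO_NC decomposes into the
// direct fragment MO_PAGEOFF and the bitmask remainder MO_NC (see the
// serializable flag tables below).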
8878 std::pair<unsigned, unsigned>
8879 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
8880   const unsigned Mask = AArch64II::MO_FRAGMENT;
8881   return std::make_pair(TF & Mask, TF & ~Mask);
8882 }
8883 
8884 ArrayRef<std::pair<unsigned, const char *>>
8885 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
8886   using namespace AArch64II;
8887 
8888   static const std::pair<unsigned, const char *> TargetFlags[] = {
8889       {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
8890       {MO_G3, "aarch64-g3"},     {MO_G2, "aarch64-g2"},
8891       {MO_G1, "aarch64-g1"},     {MO_G0, "aarch64-g0"},
8892       {MO_HI12, "aarch64-hi12"}};
8893   return ArrayRef(TargetFlags);
8894 }
8895 
8896 ArrayRef<std::pair<unsigned, const char *>>
8897 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
8898   using namespace AArch64II;
8899 
8900   static const std::pair<unsigned, const char *> TargetFlags[] = {
8901       {MO_COFFSTUB, "aarch64-coffstub"},
8902       {MO_GOT, "aarch64-got"},
8903       {MO_NC, "aarch64-nc"},
8904       {MO_S, "aarch64-s"},
8905       {MO_TLS, "aarch64-tls"},
8906       {MO_DLLIMPORT, "aarch64-dllimport"},
8907       {MO_PREL, "aarch64-prel"},
8908       {MO_TAGGED, "aarch64-tagged"},
8909       {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
8910   };
8911   return ArrayRef(TargetFlags);
8912 }
8913 
8914 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
8915 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
8916   static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8917       {{MOSuppressPair, "aarch64-suppress-pair"},
8918        {MOStridedAccess, "aarch64-strided-access"}};
8919   return ArrayRef(TargetFlags);
8920 }
8921 
8922 /// Constants defining how certain sequences should be outlined.
8923 /// This encompasses how an outlined function should be called, and what kind of
8924 /// frame should be emitted for that outlined function.
8925 ///
8926 /// \p MachineOutlinerDefault implies that the function should be called with
8927 /// a save and restore of LR to the stack.
8928 ///
8929 /// That is,
8930 ///
8931 /// I1     Save LR                    OUTLINED_FUNCTION:
8932 /// I2 --> BL OUTLINED_FUNCTION       I1
8933 /// I3     Restore LR                 I2
8934 ///                                   I3
8935 ///                                   RET
8936 ///
8937 /// * Call construction overhead: 3 (save + BL + restore)
8938 /// * Frame construction overhead: 1 (ret)
8939 /// * Requires stack fixups? Yes
8940 ///
8941 /// \p MachineOutlinerTailCall implies that the function is being created from
8942 /// a sequence of instructions ending in a return.
8943 ///
8944 /// That is,
8945 ///
8946 /// I1                             OUTLINED_FUNCTION:
8947 /// I2 --> B OUTLINED_FUNCTION     I1
8948 /// RET                            I2
8949 ///                                RET
8950 ///
8951 /// * Call construction overhead: 1 (B)
8952 /// * Frame construction overhead: 0 (Return included in sequence)
8953 /// * Requires stack fixups? No
8954 ///
8955 /// \p MachineOutlinerNoLRSave implies that the function should be called using
8956 /// a BL instruction, but doesn't require LR to be saved and restored. This
8957 /// happens when LR is known to be dead.
8958 ///
8959 /// That is,
8960 ///
8961 /// I1                                OUTLINED_FUNCTION:
8962 /// I2 --> BL OUTLINED_FUNCTION       I1
8963 /// I3                                I2
8964 ///                                   I3
8965 ///                                   RET
8966 ///
8967 /// * Call construction overhead: 1 (BL)
8968 /// * Frame construction overhead: 1 (RET)
8969 /// * Requires stack fixups? No
8970 ///
8971 /// \p MachineOutlinerThunk implies that the function is being created from
8972 /// a sequence of instructions ending in a call. The outlined function is
8973 /// called with a BL instruction, and the outlined function tail-calls the
8974 /// original call destination.
8975 ///
8976 /// That is,
8977 ///
8978 /// I1                                OUTLINED_FUNCTION:
8979 /// I2 --> BL OUTLINED_FUNCTION       I1
8980 /// BL f                              I2
8981 ///                                   B f
8982 /// * Call construction overhead: 1 (BL)
8983 /// * Frame construction overhead: 0
8984 /// * Requires stack fixups? No
8985 ///
8986 /// \p MachineOutlinerRegSave implies that the function should be called with a
8987 /// save and restore of LR to an available register. This allows us to avoid
8988 /// stack fixups. Note that this outlining variant is compatible with the
8989 /// NoLRSave case.
8990 ///
8991 /// That is,
8992 ///
8993 /// I1     Save LR                    OUTLINED_FUNCTION:
8994 /// I2 --> BL OUTLINED_FUNCTION       I1
8995 /// I3     Restore LR                 I2
8996 ///                                   I3
8997 ///                                   RET
8998 ///
8999 /// * Call construction overhead: 3 (save + BL + restore)
9000 /// * Frame construction overhead: 1 (ret)
9001 /// * Requires stack fixups? No
9002 enum MachineOutlinerClass {
9003   MachineOutlinerDefault,  /// Emit a save, restore, call, and return.
9004   MachineOutlinerTailCall, /// Only emit a branch.
9005   MachineOutlinerNoLRSave, /// Emit a call and return.
9006   MachineOutlinerThunk,    /// Emit a call and tail-call.
9007   MachineOutlinerRegSave   /// Same as default, but save to a register.
9008 };
9009 
9010 enum MachineOutlinerMBBFlags {
9011   LRUnavailableSomewhere = 0x2,
9012   HasCalls = 0x4,
9013   UnsafeRegsDead = 0x8
9014 };
9015 
9016 Register
9017 AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
9018   MachineFunction *MF = C.getMF();
9019   const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
9020   const AArch64RegisterInfo *ARI =
9021       static_cast<const AArch64RegisterInfo *>(&TRI);
9022   // Check if there is an available register across the sequence that we can
9023   // use.
9024   for (unsigned Reg : AArch64::GPR64RegClass) {
9025     if (!ARI->isReservedReg(*MF, Reg) &&
9026         Reg != AArch64::LR &&  // LR is not reserved, but don't use it.
9027         Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
9028         Reg != AArch64::X17 && // Ditto for X17.
9029         C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
9030         C.isAvailableInsideSeq(Reg, TRI))
9031       return Reg;
9032   }
9033   return Register();
9034 }
9035 
9036 static bool
9037 outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
9038                                          const outliner::Candidate &b) {
9039   const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
9040   const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
9041 
9042   return MFIa->shouldSignReturnAddress(false) == MFIb->shouldSignReturnAddress(false) &&
9043          MFIa->shouldSignReturnAddress(true) == MFIb->shouldSignReturnAddress(true);
9044 }
9045 
9046 static bool
9047 outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
9048                                        const outliner::Candidate &b) {
9049   const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
9050   const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
9051 
9052   return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
9053 }
9054 
9055 static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
9056                                                 const outliner::Candidate &b) {
9057   const AArch64Subtarget &SubtargetA =
9058       a.getMF()->getSubtarget<AArch64Subtarget>();
9059   const AArch64Subtarget &SubtargetB =
9060       b.getMF()->getSubtarget<AArch64Subtarget>();
9061   return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
9062 }
9063 
9064 std::optional<std::unique_ptr<outliner::OutlinedFunction>>
9065 AArch64InstrInfo::getOutliningCandidateInfo(
9066     const MachineModuleInfo &MMI,
9067     std::vector<outliner::Candidate> &RepeatedSequenceLocs,
9068     unsigned MinRepeats) const {
9069   unsigned SequenceSize = 0;
9070   for (auto &MI : RepeatedSequenceLocs[0])
9071     SequenceSize += getInstSizeInBytes(MI);
9072 
9073   unsigned NumBytesToCreateFrame = 0;
9074 
9075   // We only allow outlining for functions having exactly matching return
9076   // address signing attributes, i.e., all share the same value for the
9077   // attribute "sign-return-address" and all share the same type of key they
9078   // are signed with.
9079   // Additionally we require all functions to simultaneously either support
9080   // v8.3a features or not. Otherwise an outlined function could get signed
9081   // using dedicated v8.3 instructions and a call from a function that doesn't
9082   // support v8.3 instructions would therefore be invalid.
9083   if (std::adjacent_find(
9084           RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
9085           [](const outliner::Candidate &a, const outliner::Candidate &b) {
9086             // Return true if a and b are non-equal w.r.t. return address
9087             // signing or support of v8.3a features
9088             if (outliningCandidatesSigningScopeConsensus(a, b) &&
9089                 outliningCandidatesSigningKeyConsensus(a, b) &&
9090                 outliningCandidatesV8_3OpsConsensus(a, b)) {
9091               return false;
9092             }
9093             return true;
9094           }) != RepeatedSequenceLocs.end()) {
9095     return std::nullopt;
9096   }
9097 
9098   // Since at this point all candidates agree on their return address signing,
9099   // picking just one is fine. If the candidate functions potentially sign their
9100   // return addresses, the outlined function should do the same. Note that in
9101   // the case of "sign-return-address"="non-leaf" this is an assumption: It is
9102   // not certainly true that the outlined function will have to sign its return
9103   // address but this decision is made later, when the decision to outline
9104   // has already been made.
9105   // The same holds for the number of additional instructions we need: On
9106   // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
9107   // necessary. However, at this point we don't know if the outlined function
9108   // will have a RET instruction so we assume the worst.
9109   const TargetRegisterInfo &TRI = getRegisterInfo();
9110   // Performing a tail call may require extra checks when PAuth is enabled.
9111   // If PAuth is disabled, set it to zero for uniformity.
9112   unsigned NumBytesToCheckLRInTCEpilogue = 0;
9113   if (RepeatedSequenceLocs[0]
9114           .getMF()
9115           ->getInfo<AArch64FunctionInfo>()
9116           ->shouldSignReturnAddress(true)) {
9117     // One PAC and one AUT instruction.
9118     NumBytesToCreateFrame += 8;
9119 
9120     // PAuth is enabled - set extra tail call cost, if any.
9121     auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
9122         *RepeatedSequenceLocs[0].getMF());
9123     NumBytesToCheckLRInTCEpilogue =
9124         AArch64PAuth::getCheckerSizeInBytes(LRCheckMethod);
9125     // Checking the authenticated LR value may significantly impact
9126     // SequenceSize, so account for it for more precise results.
9127     if (isTailCallReturnInst(RepeatedSequenceLocs[0].back()))
9128       SequenceSize += NumBytesToCheckLRInTCEpilogue;
9129 
9130     // We have to check if SP-modifying instructions would get outlined.
9131     // If so, we only allow outlining if SP is unchanged overall: matching
9132     // sub and add instructions are okay to outline, but all other SP
9133     // modifications are not.
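    // For example, a balanced pair such as "sub sp, sp, #16" later undone by
    // "add sp, sp, #16" nets to zero and may be outlined, while an unmatched
    // "sub sp, sp, #16" leaves SP changed and disqualifies the candidate.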
9134     auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
9135       int SPValue = 0;
9136       for (auto &MI : C) {
9137         if (MI.modifiesRegister(AArch64::SP, &TRI)) {
9138           switch (MI.getOpcode()) {
9139           case AArch64::ADDXri:
9140           case AArch64::ADDWri:
9141             assert(MI.getNumOperands() == 4 && "Wrong number of operands");
9142             assert(MI.getOperand(2).isImm() &&
9143                    "Expected operand to be immediate");
9144             assert(MI.getOperand(1).isReg() &&
9145                    "Expected operand to be a register");
9146             // Check if the add just increments sp. If so, we search for
9147             // matching sub instructions that decrement sp. If not, the
9148             // modification is illegal.
9149             if (MI.getOperand(1).getReg() == AArch64::SP)
9150               SPValue += MI.getOperand(2).getImm();
9151             else
9152               return true;
9153             break;
9154           case AArch64::SUBXri:
9155           case AArch64::SUBWri:
9156             assert(MI.getNumOperands() == 4 && "Wrong number of operands");
9157             assert(MI.getOperand(2).isImm() &&
9158                    "Expected operand to be immediate");
9159             assert(MI.getOperand(1).isReg() &&
9160                    "Expected operand to be a register");
9161             // Check if the sub just decrements sp. If so, we search for
9162             // matching add instructions that increment sp. If not, the
9163             // modification is illegal.
9164             if (MI.getOperand(1).getReg() == AArch64::SP)
9165               SPValue -= MI.getOperand(2).getImm();
9166             else
9167               return true;
9168             break;
9169           default:
9170             return true;
9171           }
9172         }
9173       }
9174       if (SPValue)
9175         return true;
9176       return false;
9177     };
9178     // Remove candidates with illegal stack modifying instructions
9179     llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
9180 
9181     // If the sequence doesn't have enough candidates left, then we're done.
9182     if (RepeatedSequenceLocs.size() < MinRepeats)
9183       return std::nullopt;
9184   }
9185 
9186   // Properties about candidate MBBs that hold for all of them.
9187   unsigned FlagsSetInAll = 0xF;
9188 
9189   // Compute liveness information for each candidate, and set FlagsSetInAll.
9190   for (outliner::Candidate &C : RepeatedSequenceLocs)
9191     FlagsSetInAll &= C.Flags;
9192 
9193   unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
9194 
9195   // Helper lambda which sets call information for every candidate.
9196   auto SetCandidateCallInfo =
9197       [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
9198         for (outliner::Candidate &C : RepeatedSequenceLocs)
9199           C.setCallInfo(CallID, NumBytesForCall);
9200       };
9201 
9202   unsigned FrameID = MachineOutlinerDefault;
9203   NumBytesToCreateFrame += 4;
9204 
9205   bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
9206     return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
9207   });
9208 
9209   // We check to see if CFI instructions are present, and if they are,
9210   // we count how many occur in the candidate sequence.
9211   unsigned CFICount = 0;
9212   for (auto &I : RepeatedSequenceLocs[0]) {
9213     if (I.isCFIInstruction())
9214       CFICount++;
9215   }
9216 
9217   // We compare the number of found CFI instructions to the number of CFI
9218   // instructions in the parent function for each candidate. We must check this
9219   // since if we outline one of the CFI instructions in a function, we have to
9220   // outline them all for correctness. If we do not, the address offsets will be
9221   // incorrect between the two sections of the program.
9222   for (outliner::Candidate &C : RepeatedSequenceLocs) {
9223     std::vector<MCCFIInstruction> CFIInstructions =
9224         C.getMF()->getFrameInstructions();
9225 
9226     if (CFICount > 0 && CFICount != CFIInstructions.size())
9227       return std::nullopt;
9228   }
9229 
9230   // Returns true if an instruction is safe to fix up, false otherwise.
9231   auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
9232     if (MI.isCall())
9233       return true;
9234 
9235     if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
9236         !MI.readsRegister(AArch64::SP, &TRI))
9237       return true;
9238 
9239     // Any modification of SP will break our code to save/restore LR.
9240     // FIXME: We could handle some instructions which add a constant
9241     // offset to SP, with a bit more work.
9242     if (MI.modifiesRegister(AArch64::SP, &TRI))
9243       return false;
9244 
9245     // At this point, we have a stack instruction that we might need to
9246     // fix up. We'll handle it if it's a load or store.
9247     if (MI.mayLoadOrStore()) {
9248       const MachineOperand *Base; // Filled with the base operand of MI.
9249       int64_t Offset;             // Filled with the offset of MI.
9250       bool OffsetIsScalable;
9251 
9252       // Does it allow us to offset the base operand and is the base the
9253       // register SP?
9254       if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
9255           !Base->isReg() || Base->getReg() != AArch64::SP)
9256         return false;
9257 
9258       // Fix-up code below assumes bytes.
9259       if (OffsetIsScalable)
9260         return false;
9261 
9262       // Find the minimum/maximum offset for this instruction and check
9263       // if fixing it up would be in range.
9264       // Unscaled offsets for the instruction.
9265       int64_t MinOffset, MaxOffset;
9266       // The scale to multiply the offsets by.
9267       TypeSize Scale(0U, false), DummyWidth(0U, false);
9268       getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
9269 
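      // Around the call, LR is saved with a 16-byte SP decrement, so an
      // SP-relative access such as "ldr x0, [sp, #8]" in the original
      // sequence must be rewritten as "ldr x0, [sp, #24]" once outlined.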
9270       Offset += 16; // Update the offset to what it would be if we outlined.
9271       if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
9272           Offset > MaxOffset * (int64_t)Scale.getFixedValue())
9273         return false;
9274 
9275       // It's in range, so we can outline it.
9276       return true;
9277     }
9278 
9279     // FIXME: Add handling for instructions like "add x0, sp, #8".
9280 
9281     // We can't fix it up, so don't outline it.
9282     return false;
9283   };
9284 
9285   // True if it's possible to fix up each stack instruction in this sequence.
9286   // Important for frames/call variants that modify the stack.
9287   bool AllStackInstrsSafe =
9288       llvm::all_of(RepeatedSequenceLocs[0], IsSafeToFixup);
9289 
9290   // If the last instruction in any candidate is a terminator, then we should
9291   // tail call all of the candidates.
9292   if (RepeatedSequenceLocs[0].back().isTerminator()) {
9293     FrameID = MachineOutlinerTailCall;
9294     NumBytesToCreateFrame = 0;
9295     unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
9296     SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
9297   }
9298 
9299   else if (LastInstrOpcode == AArch64::BL ||
9300            ((LastInstrOpcode == AArch64::BLR ||
9301              LastInstrOpcode == AArch64::BLRNoIP) &&
9302             !HasBTI)) {
9303     // FIXME: Do we need to check if the code after this uses the value of LR?
9304     FrameID = MachineOutlinerThunk;
9305     NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
9306     SetCandidateCallInfo(MachineOutlinerThunk, 4);
9307   }
9308 
9309   else {
9310     // We need to decide how to emit calls + frames. We can always emit the same
9311     // frame if we don't need to save to the stack. If we have to save to the
9312     // stack, then we need a different frame.
9313     unsigned NumBytesNoStackCalls = 0;
9314     std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
9315 
9316     // Check if we have to save LR.
9317     for (outliner::Candidate &C : RepeatedSequenceLocs) {
9318       bool LRAvailable =
9319           (C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere)
9320               ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)
9321               : true;
9322       // If we have a noreturn caller, then we're going to be conservative and
9323       // say that we have to save LR. If we don't have a ret at the end of the
9324       // block, then we can't reason about liveness accurately.
9325       //
9326       // FIXME: We can probably do better than always disabling this in
9327       // noreturn functions by fixing up the liveness info.
9328       bool IsNoReturn =
9329           C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
9330 
9331       // Is LR available? If so, we don't need a save.
9332       if (LRAvailable && !IsNoReturn) {
9333         NumBytesNoStackCalls += 4;
9334         C.setCallInfo(MachineOutlinerNoLRSave, 4);
9335         CandidatesWithoutStackFixups.push_back(C);
9336       }
9337 
9338       // Is an unused register available? If so, we won't modify the stack, so
9339       // we can outline with the same frame type as those that don't save LR.
9340       else if (findRegisterToSaveLRTo(C)) {
9341         NumBytesNoStackCalls += 12;
9342         C.setCallInfo(MachineOutlinerRegSave, 12);
9343         CandidatesWithoutStackFixups.push_back(C);
9344       }
9345 
9346       // Is SP used in the sequence at all? If not, we don't have to modify
9347       // the stack, so we are guaranteed to get the same frame.
9348       else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
9349         NumBytesNoStackCalls += 12;
9350         C.setCallInfo(MachineOutlinerDefault, 12);
9351         CandidatesWithoutStackFixups.push_back(C);
9352       }
9353 
9354       // If we outline this, we need to modify the stack. Pretend we don't
9355       // outline this by saving all of its bytes.
9356       else {
9357         NumBytesNoStackCalls += SequenceSize;
9358       }
9359     }
9360 
9361     // If there are no places where we have to save LR, then note that we
9362     // don't have to update the stack. Otherwise, give every candidate the
9363     // default call type, as long as it's safe to do so.
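    // (A default call costs 12 bytes, so the mixed no-stack variants are
    //  preferred whenever their total cost is no greater than making every
    //  candidate a default call.)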
9364     if (!AllStackInstrsSafe ||
9365         NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
9366       RepeatedSequenceLocs = CandidatesWithoutStackFixups;
9367       FrameID = MachineOutlinerNoLRSave;
9368       if (RepeatedSequenceLocs.size() < MinRepeats)
9369         return std::nullopt;
9370     } else {
9371       SetCandidateCallInfo(MachineOutlinerDefault, 12);
9372 
9373       // Bugzilla ID: 46767
9374       // TODO: Check if fixing up the stack more than once is safe so we can
9375       // outline these.
9376       //
9377       // An outline resulting in a caller that requires stack fixups at the
9378       // callsite to a callee that also requires stack fixups can happen when
9379       // there are no available registers at the candidate callsite for a
9380       // candidate that itself also has calls.
9381       //
9382       // In other words if function_containing_sequence in the following pseudo
9383       // assembly requires that we save LR at the point of the call, but there
9384       // are no available registers: in this case we save using SP and as a
9385       // result the SP offsets require stack fixups by multiples of 16.
9386       //
9387       // function_containing_sequence:
9388       //   ...
9389       //   save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
9390       //   call OUTLINED_FUNCTION_N
9391       //   restore LR from SP
9392       //   ...
9393       //
9394       // OUTLINED_FUNCTION_N:
9395       //   save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
9396       //   ...
9397       //   bl foo
9398       //   restore LR from SP
9399       //   ret
9400       //
9401       // Because the code to handle more than one stack fixup does not
9402       // currently have the proper checks for legality, these cases will assert
9403       // in the AArch64 MachineOutliner. This is because the code to do this
9404       // needs more hardening, testing, better checks that generated code is
9405       // legal, etc and because it is only verified to handle a single pass of
9406       // stack fixup.
9407       //
9408       // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
9409       // these cases until they are known to be handled. Bugzilla 46767 is
9410       // referenced in comments at the assert site.
9411       //
9412       // To avoid asserting (or generating non-legal code on noassert builds)
9413       // we remove all candidates which would need more than one stack fixup by
9414       // pruning the cases where the candidate has calls while also having no
9415       // available LR and having no available general purpose registers to copy
9416       // LR to (ie one extra stack save/restore).
9417       //
9418       if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
9419         erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
9420           auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
9421           return (llvm::any_of(C, IsCall)) &&
9422                  (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
9423                   !findRegisterToSaveLRTo(C));
9424         });
9425       }
9426     }
9427 
9428     // If we dropped all of the candidates, bail out here.
9429     if (RepeatedSequenceLocs.size() < MinRepeats)
9430       return std::nullopt;
9431   }
9432 
9433   // Does every candidate's MBB contain a call? If so, then we might have a call
9434   // in the range.
9435   if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
9436     // Check if the range contains a call. These require a save + restore of the
9437     // link register.
9438     outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
9439     bool ModStackToSaveLR = false;
9440     if (any_of(drop_end(FirstCand),
9441                [](const MachineInstr &MI) { return MI.isCall(); }))
9442       ModStackToSaveLR = true;
9443 
9444     // Handle the last instruction separately. If this is a tail call, then the
9445     // last instruction is a call. We don't want to save + restore in this case.
9446     // However, it could be possible that the last instruction is a call without
9447     // it being valid to tail call this sequence. We should consider this as
9448     // well.
9449     else if (FrameID != MachineOutlinerThunk &&
9450              FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
9451       ModStackToSaveLR = true;
9452 
9453     if (ModStackToSaveLR) {
9454       // We can't fix up the stack. Bail out.
9455       if (!AllStackInstrsSafe)
9456         return std::nullopt;
9457 
9458       // Save + restore LR.
9459       NumBytesToCreateFrame += 8;
9460     }
9461   }
9462 
9463   // If we have CFI instructions, we can only outline if the outlined section
9464   // can be a tail call.
9465   if (FrameID != MachineOutlinerTailCall && CFICount > 0)
9466     return std::nullopt;
9467 
9468   return std::make_unique<outliner::OutlinedFunction>(
9469       RepeatedSequenceLocs, SequenceSize, NumBytesToCreateFrame, FrameID);
9470 }
9471 
9472 void AArch64InstrInfo::mergeOutliningCandidateAttributes(
9473     Function &F, std::vector<outliner::Candidate> &Candidates) const {
9474   // If a bunch of candidates reach this point they must agree on their return
9475   // address signing. It is therefore enough to just consider the signing
9476   // behaviour of one of them.
9477   const auto &CFn = Candidates.front().getMF()->getFunction();
9478 
9479   if (CFn.hasFnAttribute("ptrauth-returns"))
9480     F.addFnAttr(CFn.getFnAttribute("ptrauth-returns"));
9481   if (CFn.hasFnAttribute("ptrauth-auth-traps"))
9482     F.addFnAttr(CFn.getFnAttribute("ptrauth-auth-traps"));
9483   // Since all candidates belong to the same module, just copy the
9484   // function-level attributes of an arbitrary function.
9485   if (CFn.hasFnAttribute("sign-return-address"))
9486     F.addFnAttr(CFn.getFnAttribute("sign-return-address"));
9487   if (CFn.hasFnAttribute("sign-return-address-key"))
9488     F.addFnAttr(CFn.getFnAttribute("sign-return-address-key"));
9489 
9490   AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
9491 }
9492 
9493 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
9494     MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
9495   const Function &F = MF.getFunction();
9496 
9497   // Can F be deduplicated by the linker? If it can, don't outline from it.
9498   if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
9499     return false;
9500 
9501   // Don't outline from functions with section markings; the program could
9502   // expect that all the code is in the named section.
9503   // FIXME: Allow outlining from multiple functions with the same section
9504   // marking.
9505   if (F.hasSection())
9506     return false;
9507 
9508   // Outlining from functions with redzones is unsafe since the outliner may
9509   // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
9510   // outline from it.
9511   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
9512   if (!AFI || AFI->hasRedZone().value_or(true))
9513     return false;
9514 
9515   // FIXME: Determine whether it is safe to outline from functions which contain
9516   // streaming-mode changes. We may need to ensure any smstart/smstop pairs are
9517   // outlined together and ensure it is safe to outline with async unwind info,
9518   // required for saving & restoring VG around calls.
9519   if (AFI->hasStreamingModeChanges())
9520     return false;
9521 
9522   // FIXME: Teach the outliner to generate/handle Windows unwind info.
9523   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
9524     return false;
9525 
9526   // It's safe to outline from MF.
9527   return true;
9528 }
9529 
9530 SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
9531 AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
9532                                       unsigned &Flags) const {
9533   assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
9534          "Must track liveness!");
9535   SmallVector<
9536       std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
9537       Ranges;
9538   // According to the AArch64 Procedure Call Standard, the following are
9539   // undefined on entry/exit from a function call:
9540   //
9541   // * Registers x16, x17, (and thus w16, w17)
9542   // * Condition codes (and thus the NZCV register)
9543   //
9544   // If any of these registers are used inside or live across an outlined
9545   // function, then they may be modified later, either by the compiler or
9546   // some other tool (like the linker).
9547   //
9548   // To avoid outlining in these situations, partition each block into ranges
9549   // where these registers are dead. We will only outline from those ranges.
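  //
  // For example (hypothetical code): if x16 carries a value from a
  // "mov x16, x4" to a later "add x5, x16, #1", the instructions across
  // which x16 is live are excluded, and the code before and after that
  // live range forms two separate outlinable ranges.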
9550   LiveRegUnits LRU(getRegisterInfo());
9551   auto AreAllUnsafeRegsDead = [&LRU]() {
9552     return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&
9553            LRU.available(AArch64::NZCV);
9554   };
9555 
9556   // We need to know if LR is live across an outlining boundary later on in
9557   // order to decide how we'll create the outlined call, frame, etc.
9558   //
9559   // It's pretty expensive to check this for *every candidate* within a block.
9560   // That's some potentially n^2 behaviour, since in the worst case, we'd need
9561   // to compute liveness from the end of the block for O(n) candidates within
9562   // the block.
9563   //
9564   // So, to improve the average case, let's keep track of liveness from the end
9565   // of the block to the beginning of *every outlinable range*. If we know that
9566   // LR is available in every range we could outline from, then we know that
9567   // we don't need to check liveness for any candidate within that range.
9568   bool LRAvailableEverywhere = true;
9569   // Compute liveness bottom-up.
9570   LRU.addLiveOuts(MBB);
9571   // Update flags that require info about the entire MBB.
9572   auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
9573     if (MI.isCall() && !MI.isTerminator())
9574       Flags |= MachineOutlinerMBBFlags::HasCalls;
9575   };
9576   // Range: [RangeBegin, RangeEnd)
9577   MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
9578   unsigned RangeLen;
9579   auto CreateNewRangeStartingAt =
9580       [&RangeBegin, &RangeEnd,
9581        &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
9582         RangeBegin = NewBegin;
9583         RangeEnd = std::next(RangeBegin);
9584         RangeLen = 0;
9585       };
9586   auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
9587     // At least one unsafe register is live at this point, so the current
9588     // range ends here. If the range is long enough (more than one instruction)
9589     // to outline from, save [RangeBegin, RangeEnd).
9590     if (RangeLen > 1)
9591       Ranges.push_back(std::make_pair(RangeBegin, RangeEnd));
9592   };
9593   // Find the first point where all unsafe registers are dead.
9594   // FIND: <safe instr> <-- end of first potential range
9595   // SKIP: <unsafe def>
9596   // SKIP: ... everything between ...
9597   // SKIP: <unsafe use>
9598   auto FirstPossibleEndPt = MBB.instr_rbegin();
9599   for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
9600     LRU.stepBackward(*FirstPossibleEndPt);
9601     // Update flags that impact how we outline across the entire block,
9602     // regardless of safety.
9603     UpdateWholeMBBFlags(*FirstPossibleEndPt);
9604     if (AreAllUnsafeRegsDead())
9605       break;
9606   }
9607   // If we exhausted the entire block, we have no safe ranges to outline.
9608   if (FirstPossibleEndPt == MBB.instr_rend())
9609     return Ranges;
9610   // Current range.
9611   CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
9612   // FirstPossibleEndPt points to the first place where all unsafe registers
9613   // are dead (if there is any such point). Begin partitioning the MBB into
9614   // ranges.
9615   for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) {
9616     LRU.stepBackward(MI);
9617     UpdateWholeMBBFlags(MI);
9618     if (!AreAllUnsafeRegsDead()) {
9619       SaveRangeIfNonEmpty();
9620       CreateNewRangeStartingAt(MI.getIterator());
9621       continue;
9622     }
9623     LRAvailableEverywhere &= LRU.available(AArch64::LR);
9624     RangeBegin = MI.getIterator();
9625     ++RangeLen;
9626   }
9627   // The above loop misses the last (or only) range. If we are still safe,
9628   // save that range as well.
9629   if (AreAllUnsafeRegsDead())
9630     SaveRangeIfNonEmpty();
9631   if (Ranges.empty())
9632     return Ranges;
9633   // We found the ranges bottom-up, but the mapping expects them top-down.
9634   // Reverse the order.
9635   std::reverse(Ranges.begin(), Ranges.end());
9636   // If there is at least one outlinable range where LR is unavailable
9637   // somewhere, remember that.
9638   if (!LRAvailableEverywhere)
9639     Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
9640   return Ranges;
9641 }
9642 
9643 outliner::InstrType
9644 AArch64InstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
9645                                        MachineBasicBlock::iterator &MIT,
9646                                        unsigned Flags) const {
9647   MachineInstr &MI = *MIT;
9648 
9649   // Don't outline anything used for return address signing. The outlined
9650   // function will get signed later if needed.
9651   switch (MI.getOpcode()) {
9652   case AArch64::PACM:
9653   case AArch64::PACIASP:
9654   case AArch64::PACIBSP:
9655   case AArch64::PACIASPPC:
9656   case AArch64::PACIBSPPC:
9657   case AArch64::AUTIASP:
9658   case AArch64::AUTIBSP:
9659   case AArch64::AUTIASPPCi:
9660   case AArch64::AUTIASPPCr:
9661   case AArch64::AUTIBSPPCi:
9662   case AArch64::AUTIBSPPCr:
9663   case AArch64::RETAA:
9664   case AArch64::RETAB:
9665   case AArch64::RETAASPPCi:
9666   case AArch64::RETAASPPCr:
9667   case AArch64::RETABSPPCi:
9668   case AArch64::RETABSPPCr:
9669   case AArch64::EMITBKEY:
9670   case AArch64::PAUTH_PROLOGUE:
9671   case AArch64::PAUTH_EPILOGUE:
9672     return outliner::InstrType::Illegal;
9673   }
9674 
9675   // We can only outline these if we will tail call the outlined function, or
9676   // fix up the CFI offsets. Currently, CFI instructions are outlined only
9677   // when the outlined section is a tail call.
9678   //
9679   // FIXME: If the proper fixups for the offset are implemented, this should be
9680   // possible.
9681   if (MI.isCFIInstruction())
9682     return outliner::InstrType::Legal;
9683 
9684   // Is this a terminator for a basic block?
9685   if (MI.isTerminator())
9686     // TargetInstrInfo::getOutliningType has already filtered out anything
9687     // that would break this, so we can allow it here.
9688     return outliner::InstrType::Legal;
9689 
9690   // Make sure none of the operands are un-outlinable.
9691   for (const MachineOperand &MOP : MI.operands()) {
9692     // A check preventing CFI indices was here before, but only CFI
9693     // instructions should have those.
9694     assert(!MOP.isCFIIndex());
9695 
9696     // If it uses LR or W30 explicitly, then don't touch it.
9697     if (MOP.isReg() && !MOP.isImplicit() &&
9698         (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
9699       return outliner::InstrType::Illegal;
9700   }
9701 
9702   // Special cases for instructions that can always be outlined, but will fail
9703   // the later tests. E.g. ADRP, which is PC-relative and may use LR, but can
9704   // always be outlined because it doesn't require a *specific* value in LR.
9705   if (MI.getOpcode() == AArch64::ADRP)
9706     return outliner::InstrType::Legal;
9707 
9708   // If MI is a call we might be able to outline it. We don't want to outline
9709   // any calls that rely on the position of items on the stack. When we outline
9710   // something containing a call, we have to emit a save and restore of LR in
9711   // the outlined function. Currently, this always happens by saving LR to the
9712   // stack. Thus, if we outline, say, half the parameters for a function call
9713   // plus the call, then we'll break the callee's expectations for the layout
9714   // of the stack.
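  //
  // As a hypothetical example: outlining a "str x0, [sp]" that passes an
  // argument on the stack together with the "bl callee" that consumes it
  // would break, because the outlined function's "str lr, [sp, #-16]!"
  // leaves SP 16 bytes lower at the call, so the callee would look for the
  // argument in the wrong place.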
9715   //
9716   // FIXME: Allow calls to functions which construct a stack frame, as long
9717   // as they don't access arguments on the stack.
9718   // FIXME: Figure out some way to analyze functions defined in other modules.
9719   // We should be able to compute the memory usage based on the IR calling
9720   // convention, even if we can't see the definition.
9721   if (MI.isCall()) {
9722     // Get the function associated with the call. Look at each operand and find
9723     // the one that represents the callee and get its name.
9724     const Function *Callee = nullptr;
9725     for (const MachineOperand &MOP : MI.operands()) {
9726       if (MOP.isGlobal()) {
9727         Callee = dyn_cast<Function>(MOP.getGlobal());
9728         break;
9729       }
9730     }
9731 
9732     // Never outline calls to mcount.  There isn't any rule that would require
9733     // this, but the Linux kernel's "ftrace" feature depends on it.
9734     if (Callee && Callee->getName() == "\01_mcount")
9735       return outliner::InstrType::Illegal;
9736 
9737     // If we don't know anything about the callee, assume it depends on the
9738     // stack layout of the caller. In that case, it's only legal to outline
9739     // as a tail-call. Explicitly list the call instructions we know about so we
9740     // don't get unexpected results with call pseudo-instructions.
9741     auto UnknownCallOutlineType = outliner::InstrType::Illegal;
9742     if (MI.getOpcode() == AArch64::BLR ||
9743         MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
9744       UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
9745 
9746     if (!Callee)
9747       return UnknownCallOutlineType;
9748 
9749     // We have a function we have information about. Check if it's something we
9750     // can safely outline.
9751     MachineFunction *CalleeMF = MMI.getMachineFunction(*Callee);
9752 
9753     // We don't know what's going on with the callee at all. Don't touch it.
9754     if (!CalleeMF)
9755       return UnknownCallOutlineType;
9756 
9757     // Check if we know anything about the callee saves on the function. If we
9758     // don't, then don't touch it, since that implies that we haven't
9759     // computed anything about its stack frame yet.
9760     MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
9761     if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
9762         MFI.getNumObjects() > 0)
9763       return UnknownCallOutlineType;
9764 
9765     // At this point, we can say that CalleeMF ought not to pass anything on
9766     // the stack. Therefore, we can outline it.
9767     return outliner::InstrType::Legal;
9768   }
9769 
9770   // Don't touch the link register or W30.
9771   if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
9772       MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
9773     return outliner::InstrType::Illegal;
9774 
9775   // Don't outline BTI instructions, because that will prevent the outlining
9776   // site from being indirectly callable.
9777   if (hasBTISemantics(MI))
9778     return outliner::InstrType::Illegal;
9779 
9780   return outliner::InstrType::Legal;
9781 }
9782 
9783 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
9784   for (MachineInstr &MI : MBB) {
9785     const MachineOperand *Base;
9786     TypeSize Width(0, false);
9787     int64_t Offset;
9788     bool OffsetIsScalable;
9789 
9790     // Is this a load or store with an immediate offset with SP as the base?
9791     if (!MI.mayLoadOrStore() ||
9792         !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
9793                                       &RI) ||
9794         (Base->isReg() && Base->getReg() != AArch64::SP))
9795       continue;
9796 
9797     // It is, so we have to fix it up.
9798     TypeSize Scale(0U, false);
9799     int64_t Dummy1, Dummy2;
9800 
9801     MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
9802     assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
9803     getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
9804     assert(Scale != 0 && "Unexpected opcode!");
9805     assert(!OffsetIsScalable && "Expected offset to be a byte offset");
9806 
9807     // We've pushed the return address to the stack, so add 16 to the offset.
9808     // This is safe, since we already checked if it would overflow when we
9809     // checked if this instruction was legal to outline.
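    //
    // Worked example (hypothetical): for an LDRXui "ldr x0, [sp, #24]", the
    // byte Offset is 24 and Scale is 8, so the new encoded immediate is
    // (24 + 16) / 8 = 5, i.e. "ldr x0, [sp, #40]".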
9810     int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
9811     StackOffsetOperand.setImm(NewImm);
9812   }
9813 }
9814 
9815 static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
9816                                  const AArch64InstrInfo *TII,
9817                                  bool ShouldSignReturnAddr) {
9818   if (!ShouldSignReturnAddr)
9819     return;
9820 
9821   BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE))
9822       .setMIFlag(MachineInstr::FrameSetup);
9823   BuildMI(MBB, MBB.getFirstInstrTerminator(), DebugLoc(),
9824           TII->get(AArch64::PAUTH_EPILOGUE))
9825       .setMIFlag(MachineInstr::FrameDestroy);
9826 }
9827 
9828 void AArch64InstrInfo::buildOutlinedFrame(
9829     MachineBasicBlock &MBB, MachineFunction &MF,
9830     const outliner::OutlinedFunction &OF) const {
9831 
9832   AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
9833 
9834   if (OF.FrameConstructionID == MachineOutlinerTailCall)
9835     FI->setOutliningStyle("Tail Call");
9836   else if (OF.FrameConstructionID == MachineOutlinerThunk) {
9837     // For thunk outlining, rewrite the last instruction from a call to a
9838     // tail-call.
9839     MachineInstr *Call = &*--MBB.instr_end();
9840     unsigned TailOpcode;
9841     if (Call->getOpcode() == AArch64::BL) {
9842       TailOpcode = AArch64::TCRETURNdi;
9843     } else {
9844       assert(Call->getOpcode() == AArch64::BLR ||
9845              Call->getOpcode() == AArch64::BLRNoIP);
9846       TailOpcode = AArch64::TCRETURNriALL;
9847     }
9848     MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
9849                            .add(Call->getOperand(0))
9850                            .addImm(0);
9851     MBB.insert(MBB.end(), TC);
9852     Call->eraseFromParent();
9853 
9854     FI->setOutliningStyle("Thunk");
9855   }
9856 
9857   bool IsLeafFunction = true;
9858 
9859   // Is there a call in the outlined range?
9860   auto IsNonTailCall = [](const MachineInstr &MI) {
9861     return MI.isCall() && !MI.isReturn();
9862   };
9863 
9864   if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
9865     // Fix up the instructions in the range, since we're going to modify the
9866     // stack.
9867 
9868     // Bugzilla ID: 46767
9869     // TODO: Check if fixing up twice is safe so we can outline these.
9870     assert(OF.FrameConstructionID != MachineOutlinerDefault &&
9871            "Can only fix up stack references once");
9872     fixupPostOutline(MBB);
9873 
9874     IsLeafFunction = false;
9875 
9876     // LR has to be a live in so that we can save it.
9877     if (!MBB.isLiveIn(AArch64::LR))
9878       MBB.addLiveIn(AArch64::LR);
9879 
9880     MachineBasicBlock::iterator It = MBB.begin();
9881     MachineBasicBlock::iterator Et = MBB.end();
9882 
9883     if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9884         OF.FrameConstructionID == MachineOutlinerThunk)
9885       Et = std::prev(MBB.end());
9886 
9887     // Insert a save before the outlined region
9888     MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
9889                                 .addReg(AArch64::SP, RegState::Define)
9890                                 .addReg(AArch64::LR)
9891                                 .addReg(AArch64::SP)
9892                                 .addImm(-16);
9893     It = MBB.insert(It, STRXpre);
9894 
9895     if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
9896       CFIInstBuilder CFIBuilder(MBB, It, MachineInstr::FrameSetup);
9897 
9898       // Add a CFI saying the stack was moved 16 B down.
9899       CFIBuilder.buildDefCFAOffset(16);
9900 
9901       // Add a CFI saying that the LR that we want to find is now 16 B higher
9902       // than before.
9903       CFIBuilder.buildOffset(AArch64::LR, -16);
9904     }
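    // The net effect at the top of the outlined function is roughly
    // (a sketch of the expected emission, not verified output):
    //   str x30, [sp, #-16]!
    //   .cfi_def_cfa_offset 16
    //   .cfi_offset w30, -16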
9905 
9906     // Insert a restore before the terminator for the function.
9907     MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
9908                                  .addReg(AArch64::SP, RegState::Define)
9909                                  .addReg(AArch64::LR, RegState::Define)
9910                                  .addReg(AArch64::SP)
9911                                  .addImm(16);
9912     Et = MBB.insert(Et, LDRXpost);
9913   }
9914 
9915   bool ShouldSignReturnAddr = FI->shouldSignReturnAddress(!IsLeafFunction);
9916 
9917   // If this is a tail call outlined function, then there's already a return.
9918   if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9919       OF.FrameConstructionID == MachineOutlinerThunk) {
9920     signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
9921     return;
9922   }
9923 
9924   // It's not a tail call, so we have to insert the return ourselves.
9925 
9926   // LR has to be a live in so that we can return to it.
9927   if (!MBB.isLiveIn(AArch64::LR))
9928     MBB.addLiveIn(AArch64::LR);
9929 
9930   MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
9931                           .addReg(AArch64::LR);
9932   MBB.insert(MBB.end(), ret);
9933 
9934   signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
9935 
9936   FI->setOutliningStyle("Function");
9937 
9938   // Did we have to modify the stack by saving the link register?
9939   if (OF.FrameConstructionID != MachineOutlinerDefault)
9940     return;
9941 
9942   // We modified the stack.
9943   // Walk over the basic block and fix up all the stack accesses.
9944   fixupPostOutline(MBB);
9945 }
9946 
9947 MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
9948     Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
9949     MachineFunction &MF, outliner::Candidate &C) const {
9950 
9951   // Are we tail calling?
9952   if (C.CallConstructionID == MachineOutlinerTailCall) {
9953     // If yes, then we can just branch to the label.
9954     It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
9955                             .addGlobalAddress(M.getNamedValue(MF.getName()))
9956                             .addImm(0));
9957     return It;
9958   }
9959 
9960   // Are we saving the link register?
9961   if (C.CallConstructionID == MachineOutlinerNoLRSave ||
9962       C.CallConstructionID == MachineOutlinerThunk) {
9963     // No, so just insert the call.
9964     It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
9965                             .addGlobalAddress(M.getNamedValue(MF.getName())));
9966     return It;
9967   }
9968 
9969   // We want to return the spot where we inserted the call.
9970   MachineBasicBlock::iterator CallPt;
9971 
9972   // Instructions for saving and restoring LR around the call instruction we're
9973   // going to insert.
9974   MachineInstr *Save;
9975   MachineInstr *Restore;
9976   // Can we save to a register?
9977   if (C.CallConstructionID == MachineOutlinerRegSave) {
9978     // FIXME: This logic should be sunk into a target-specific interface so that
9979     // we don't have to recompute the register.
9980     Register Reg = findRegisterToSaveLRTo(C);
9981     assert(Reg && "No callee-saved register available?");
9982 
9983     // LR has to be a live in so that we can save it.
9984     if (!MBB.isLiveIn(AArch64::LR))
9985       MBB.addLiveIn(AArch64::LR);
9986 
9987     // Save and restore LR from Reg.
9988     Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
9989                .addReg(AArch64::XZR)
9990                .addReg(AArch64::LR)
9991                .addImm(0);
9992     Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
9993                 .addReg(AArch64::XZR)
9994                 .addReg(Reg)
9995                 .addImm(0);
9996   } else {
9997     // We have the default case. Save and restore from SP.
9998     Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
9999                .addReg(AArch64::SP, RegState::Define)
10000                .addReg(AArch64::LR)
10001                .addReg(AArch64::SP)
10002                .addImm(-16);
10003     Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
10004                   .addReg(AArch64::SP, RegState::Define)
10005                   .addReg(AArch64::LR, RegState::Define)
10006                   .addReg(AArch64::SP)
10007                   .addImm(16);
10008   }
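  // In assembly, the two save strategies look roughly like (a sketch;
  // OUTLINED_FUNCTION is a hypothetical outlined-function name):
  //   mov x<reg>, x30          ; ORRXrs register save
  //   bl  OUTLINED_FUNCTION
  //   mov x30, x<reg>
  // or
  //   str x30, [sp, #-16]!     ; STRXpre stack save
  //   bl  OUTLINED_FUNCTION
  //   ldr x30, [sp], #16       ; LDRXpost restore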
10009 
10010   It = MBB.insert(It, Save);
10011   It++;
10012 
10013   // Insert the call.
10014   It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
10015                           .addGlobalAddress(M.getNamedValue(MF.getName())));
10016   CallPt = It;
10017   It++;
10018 
10019   It = MBB.insert(It, Restore);
10020   return CallPt;
10021 }
10022 
10023 bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
10024   MachineFunction &MF) const {
10025   return MF.getFunction().hasMinSize();
10026 }
10027 
10028 void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
10029                                           MachineBasicBlock::iterator Iter,
10030                                           DebugLoc &DL,
10031                                           bool AllowSideEffects) const {
10032   const MachineFunction &MF = *MBB.getParent();
10033   const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
10034   const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
10035 
10036   if (TRI.isGeneralPurposeRegister(MF, Reg)) {
10037     BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0);
10038   } else if (STI.isSVEorStreamingSVEAvailable()) {
10039     BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg)
10040       .addImm(0)
10041       .addImm(0);
10042   } else if (STI.isNeonAvailable()) {
10043     BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg)
10044       .addImm(0);
10045   } else {
10046     // This is a streaming-compatible function without SVE. We don't have full
10047     // Neon (just FPRs), so we can at most use the first 64-bit sub-register.
10048     // Since `movi v..` would be illegal here, use `fmov d..` instead.
10049     assert(STI.hasNEON() && "Expected to have NEON.");
10050     Register Reg64 = TRI.getSubReg(Reg, AArch64::dsub);
10051     BuildMI(MBB, Iter, DL, get(AArch64::FMOVD0), Reg64);
10052   }
10053 }
10054 
10055 std::optional<DestSourcePair>
10056 AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
10057 
10058   // AArch64::ORRWrs and AArch64::ORRXrs with a WZR/XZR source register and a
10059   // zero immediate operand are used as an alias for the mov instruction.
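  // For example, "orr x0, xzr, x1" (ORRXrs with a zero shift amount) is the
  // canonical alias of "mov x0, x1".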
10060   if (((MI.getOpcode() == AArch64::ORRWrs &&
10061         MI.getOperand(1).getReg() == AArch64::WZR &&
10062         MI.getOperand(3).getImm() == 0x0) ||
10063        (MI.getOpcode() == AArch64::ORRWrr &&
10064         MI.getOperand(1).getReg() == AArch64::WZR)) &&
10065       // Check that the w->w move is not a zero-extending w->x mov.
10066       (!MI.getOperand(0).getReg().isVirtual() ||
10067        MI.getOperand(0).getSubReg() == 0) &&
10068       (!MI.getOperand(0).getReg().isPhysical() ||
10069        MI.findRegisterDefOperandIdx(getXRegFromWReg(MI.getOperand(0).getReg()),
10070                                     /*TRI=*/nullptr) == -1))
10071     return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
10072 
10073   if (MI.getOpcode() == AArch64::ORRXrs &&
10074       MI.getOperand(1).getReg() == AArch64::XZR &&
10075       MI.getOperand(3).getImm() == 0x0)
10076     return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
10077 
10078   return std::nullopt;
10079 }
10080 
10081 std::optional<DestSourcePair>
10082 AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const {
10083   if ((MI.getOpcode() == AArch64::ORRWrs &&
10084        MI.getOperand(1).getReg() == AArch64::WZR &&
10085        MI.getOperand(3).getImm() == 0x0) ||
10086       (MI.getOpcode() == AArch64::ORRWrr &&
10087        MI.getOperand(1).getReg() == AArch64::WZR))
10088     return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
10089   return std::nullopt;
10090 }
10091 
10092 std::optional<RegImmPair>
10093 AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
10094   int Sign = 1;
10095   int64_t Offset = 0;
10096 
10097   // TODO: Handle cases where Reg is a super- or sub-register of the
10098   // destination register.
10099   const MachineOperand &Op0 = MI.getOperand(0);
10100   if (!Op0.isReg() || Reg != Op0.getReg())
10101     return std::nullopt;
10102 
10103   switch (MI.getOpcode()) {
10104   default:
10105     return std::nullopt;
10106   case AArch64::SUBWri:
10107   case AArch64::SUBXri:
10108   case AArch64::SUBSWri:
10109   case AArch64::SUBSXri:
10110     Sign *= -1;
10111     [[fallthrough]];
10112   case AArch64::ADDSWri:
10113   case AArch64::ADDSXri:
10114   case AArch64::ADDWri:
10115   case AArch64::ADDXri: {
10116     // TODO: The third operand can be a global address (usually some string).
10117     if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
10118         !MI.getOperand(2).isImm())
10119       return std::nullopt;
10120     int Shift = MI.getOperand(3).getImm();
10121     assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
10122     Offset = Sign * (MI.getOperand(2).getImm() << Shift);
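    // E.g. (hypothetical) "sub x0, x1, #4, lsl #12" (SUBXri with Shift == 12)
    // yields RegImmPair{x1, -16384}.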
10123   }
10124   }
10125   return RegImmPair{MI.getOperand(1).getReg(), Offset};
10126 }
10127 
10128 /// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
10129 /// the destination register then, if possible, describe the value in terms of
10130 /// the source register.
10131 static std::optional<ParamLoadedValue>
10132 describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
10133                        const TargetInstrInfo *TII,
10134                        const TargetRegisterInfo *TRI) {
10135   auto DestSrc = TII->isCopyLikeInstr(MI);
10136   if (!DestSrc)
10137     return std::nullopt;
10138 
10139   Register DestReg = DestSrc->Destination->getReg();
10140   Register SrcReg = DestSrc->Source->getReg();
10141 
10142   auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
10143 
10144   // If the described register is the destination, just return the source.
10145   if (DestReg == DescribedReg)
10146     return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
10147 
10148   // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
10149   if (MI.getOpcode() == AArch64::ORRWrs &&
10150       TRI->isSuperRegister(DestReg, DescribedReg))
10151     return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
10152 
10153   // We may need to describe the lower part of an ORRXrs move.
10154   if (MI.getOpcode() == AArch64::ORRXrs &&
10155       TRI->isSubRegister(DestReg, DescribedReg)) {
10156     Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
10157     return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
10158   }
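  // E.g. for a "mov x0, x1" (ORRXrs), a request to describe w0 can be
  // answered with w1, the lower 32 bits of the source.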
10159 
10160   assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
10161          "Unhandled ORR[XW]rs copy case");
10162 
10163   return std::nullopt;
10164 }
10165 
10166 bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
10167   // Functions cannot be split to different sections on AArch64 if they have
10168   // a red zone. This is because relaxing a cross-section branch may require
10169   // incrementing the stack pointer to spill a register, which would overwrite
10170   // the red zone.
10171   if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true))
10172     return false;
10173 
10174   return TargetInstrInfo::isFunctionSafeToSplit(MF);
10175 }
10176 
10177 bool AArch64InstrInfo::isMBBSafeToSplitToCold(
10178     const MachineBasicBlock &MBB) const {
10179   // Asm Goto blocks can contain conditional branches to goto labels, which can
10180   // get moved out of range of the branch instruction.
10181   auto isAsmGoto = [](const MachineInstr &MI) {
10182     return MI.getOpcode() == AArch64::INLINEASM_BR;
10183   };
10184   if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
10185     return false;
10186 
10187   // Because jump tables are label-relative instead of table-relative, they all
10188   // must be in the same section or relocation fixup handling will fail.
10189 
10190   // Check if MBB is a jump table target
10191   const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
10192   auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
10193     return llvm::is_contained(JTE.MBBs, &MBB);
10194   };
10195   if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB))
10196     return false;
10197 
10198   // Check if MBB contains a jump table lookup
10199   for (const MachineInstr &MI : MBB) {
10200     switch (MI.getOpcode()) {
10201     case TargetOpcode::G_BRJT:
10202     case AArch64::JumpTableDest32:
10203     case AArch64::JumpTableDest16:
10204     case AArch64::JumpTableDest8:
10205       return false;
10206     default:
10207       continue;
10208     }
10209   }
10210 
10211   // MBB isn't a special case, so it's safe to be split to the cold section.
10212   return true;
10213 }
10214 
10215 std::optional<ParamLoadedValue>
10216 AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
10217                                       Register Reg) const {
10218   const MachineFunction *MF = MI.getMF();
10219   const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
10220   switch (MI.getOpcode()) {
10221   case AArch64::MOVZWi:
10222   case AArch64::MOVZXi: {
10223     // MOVZWi may be used for producing zero-extended 32-bit immediates in
10224     // 64-bit parameters, so we need to consider super-registers.
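    // E.g. (hypothetical) a "movz w0, #42" describes not only w0 but also x0,
    // since the 32-bit write zero-extends into the full 64-bit register.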
10225     if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
10226       return std::nullopt;
10227 
10228     if (!MI.getOperand(1).isImm())
10229       return std::nullopt;
10230     int64_t Immediate = MI.getOperand(1).getImm();
10231     int Shift = MI.getOperand(2).getImm();
10232     return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
10233                             nullptr);
10234   }
10235   case AArch64::ORRWrs:
10236   case AArch64::ORRXrs:
10237     return describeORRLoadedValue(MI, Reg, this, TRI);
10238   }
10239 
10240   return TargetInstrInfo::describeLoadedValue(MI, Reg);
10241 }
10242 
10243 bool AArch64InstrInfo::isExtendLikelyToBeFolded(
10244     MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
10245   assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
10246          ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
10247          ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
10248 
10249   // Anyexts are nops.
10250   if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
10251     return true;
10252 
10253   Register DefReg = ExtMI.getOperand(0).getReg();
10254   if (!MRI.hasOneNonDBGUse(DefReg))
10255     return false;
10256 
10257   // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
10258   // addressing mode.
10259   auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
10260   return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
10261 }
10262 
10263 uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
10264   return get(Opc).TSFlags & AArch64::ElementSizeMask;
10265 }
10266 
10267 bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
10268   return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
10269 }
10270 
10271 bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
10272   return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
10273 }
10274 
10275 unsigned int
10276 AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
10277   return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
10278 }
10279 
10280 bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
10281                                              unsigned Scale) const {
10282   if (Offset && Scale)
10283     return false;
10284 
10285   // Check Reg + Imm
10286   if (!Scale) {
10287     // 9-bit signed offset
10288     if (isInt<9>(Offset))
10289       return true;
10290 
10291     // 12-bit unsigned offset
10292     unsigned Shift = Log2_64(NumBytes);
10293     if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
10294         // Must be a multiple of NumBytes (NumBytes is a power of 2)
10295         (Offset >> Shift) << Shift == Offset)
10296       return true;
10297     return false;
10298   }
10299 
10300   // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
10301   return Scale == 1 || (Scale > 0 && Scale == NumBytes);
10302 }
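// A few examples of the checks above (hedged sketches): NumBytes == 8 with
// Offset == 32 and Scale == 0 is legal ("ldr x0, [x1, #32]"); Offset == -8
// fits the 9-bit signed form ("ldur x0, [x1, #-8]"); and Scale == 8 with
// NumBytes == 8 models "ldr x0, [x1, x2, lsl #3]".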
10303 
10304 unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
10305   if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
10306     return AArch64::BLRNoIP;
10307   else
10308     return AArch64::BLR;
10309 }
10310 
10311 MachineBasicBlock::iterator
10312 AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
10313                                    Register TargetReg, bool FrameSetup) const {
10314   assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
10315 
10316   MachineBasicBlock &MBB = *MBBI->getParent();
10317   MachineFunction &MF = *MBB.getParent();
10318   const AArch64InstrInfo *TII =
10319       MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
10320   int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
10321   DebugLoc DL = MBB.findDebugLoc(MBBI);
10322 
10323   MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
10324   MachineBasicBlock *LoopTestMBB =
10325       MF.CreateMachineBasicBlock(MBB.getBasicBlock());
10326   MF.insert(MBBInsertPoint, LoopTestMBB);
10327   MachineBasicBlock *LoopBodyMBB =
10328       MF.CreateMachineBasicBlock(MBB.getBasicBlock());
10329   MF.insert(MBBInsertPoint, LoopBodyMBB);
10330   MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
10331   MF.insert(MBBInsertPoint, ExitMBB);
10332   MachineInstr::MIFlag Flags =
10333       FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;
10334 
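  // Shape of the emitted probe loop (a sketch, assuming a hypothetical
  // 4096-byte probe size):
  //
  //   LoopTest:
  //     sub  sp, sp, #4096
  //     cmp  sp, <TargetReg>
  //     b.le LoopExit
  //   LoopBody:
  //     str  xzr, [sp]
  //     b    LoopTest
  //   LoopExit:
  //     mov  sp, <TargetReg>
  //     ldr  xzr, [sp]
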
10335   // LoopTest:
10336   //   SUB SP, SP, #ProbeSize
10337   emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
10338                   AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);
10339 
10340   //   CMP SP, TargetReg
10341   BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
10342           AArch64::XZR)
10343       .addReg(AArch64::SP)
10344       .addReg(TargetReg)
10345       .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
10346       .setMIFlags(Flags);
10347 
10348   //   B.<Cond> LoopExit
10349   BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
10350       .addImm(AArch64CC::LE)
10351       .addMBB(ExitMBB)
10352       .setMIFlags(Flags);
10353 
10354   //   STR XZR, [SP]
10355   BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
10356       .addReg(AArch64::XZR)
10357       .addReg(AArch64::SP)
10358       .addImm(0)
10359       .setMIFlags(Flags);
10360 
10361   //   B loop
10362   BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
10363       .addMBB(LoopTestMBB)
10364       .setMIFlags(Flags);
10365 
10366   // LoopExit:
10367   //   MOV SP, TargetReg
10368   BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
10369       .addReg(TargetReg)
10370       .addImm(0)
10371       .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
10372       .setMIFlags(Flags);
10373 
10374   //   LDR XZR, [SP]
10375   BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))
10376       .addReg(AArch64::XZR, RegState::Define)
10377       .addReg(AArch64::SP)
10378       .addImm(0)
10379       .setMIFlags(Flags);
10380 
10381   ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
10382   ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
10383 
10384   LoopTestMBB->addSuccessor(ExitMBB);
10385   LoopTestMBB->addSuccessor(LoopBodyMBB);
10386   LoopBodyMBB->addSuccessor(LoopTestMBB);
10387   MBB.addSuccessor(LoopTestMBB);
10388 
10389   // Update liveins.
10390   if (MF.getRegInfo().reservedRegsFrozen())
10391     fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB});
10392 
10393   return ExitMBB->begin();
10394 }
10395 
10396 namespace {
10397 class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
10398   MachineFunction *MF;
10399   const TargetInstrInfo *TII;
10400   const TargetRegisterInfo *TRI;
10401   MachineRegisterInfo &MRI;
10402 
10403   /// The block of the loop
10404   MachineBasicBlock *LoopBB;
10405   /// The conditional branch of the loop
10406   MachineInstr *CondBranch;
10407   /// The compare instruction for loop control
10408   MachineInstr *Comp;
10409   /// The number of the operand of the loop counter value in Comp
10410   unsigned CompCounterOprNum;
10411   /// The instruction that updates the loop counter value
10412   MachineInstr *Update;
10413   /// The number of the operand of the loop counter value in Update
10414   unsigned UpdateCounterOprNum;
10415   /// The initial value of the loop counter
10416   Register Init;
10417   /// True iff Update is a predecessor of Comp
10418   bool IsUpdatePriorComp;
10419 
10420   /// The normalized condition used by createTripCountGreaterCondition()
10421   SmallVector<MachineOperand, 4> Cond;
10422 
10423 public:
10424   AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
10425                            MachineInstr *Comp, unsigned CompCounterOprNum,
10426                            MachineInstr *Update, unsigned UpdateCounterOprNum,
10427                            Register Init, bool IsUpdatePriorComp,
10428                            const SmallVectorImpl<MachineOperand> &Cond)
10429       : MF(Comp->getParent()->getParent()),
10430         TII(MF->getSubtarget().getInstrInfo()),
10431         TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
10432         LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
10433         CompCounterOprNum(CompCounterOprNum), Update(Update),
10434         UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
10435         IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}
10436 
10437   bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
10438     // Make the instructions for loop control be placed in stage 0.
10439     // The predecessors of Comp are considered by the caller.
10440     return MI == Comp;
10441   }
10442 
10443   std::optional<bool> createTripCountGreaterCondition(
10444       int TC, MachineBasicBlock &MBB,
10445       SmallVectorImpl<MachineOperand> &CondParam) override {
10446     // A branch instruction will be inserted as "if (Cond) goto epilogue".
10447     // Cond is normalized for such use.
10448     // The predecessors of the branch are assumed to have already been inserted.
10449     CondParam = Cond;
10450     return {};
10451   }
10452 
10453   void createRemainingIterationsGreaterCondition(
10454       int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
10455       DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;
10456 
10457   void setPreheader(MachineBasicBlock *NewPreheader) override {}
10458 
10459   void adjustTripCount(int TripCountAdjust) override {}
10460 
10461   bool isMVEExpanderSupported() override { return true; }
10462 };
10463 } // namespace
10464 
10465 /// Clone an instruction from MI. The register of the ReplaceOprNum-th operand
10466 /// is replaced by ReplaceReg. The output register is newly created.
10467 /// The other operands are unchanged from MI.
10468 static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
10469                            Register ReplaceReg, MachineBasicBlock &MBB,
10470                            MachineBasicBlock::iterator InsertTo) {
10471   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
10472   const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
10473   const TargetRegisterInfo *TRI =
10474       MBB.getParent()->getSubtarget().getRegisterInfo();
10475   MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);
10476   Register Result = 0;
10477   for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
10478     if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) {
10479       Result = MRI.createVirtualRegister(
10480           MRI.getRegClass(NewMI->getOperand(0).getReg()));
10481       NewMI->getOperand(I).setReg(Result);
10482     } else if (I == ReplaceOprNum) {
10483       MRI.constrainRegClass(
10484           ReplaceReg,
10485           TII->getRegClass(NewMI->getDesc(), I, TRI, *MBB.getParent()));
10486       NewMI->getOperand(I).setReg(ReplaceReg);
10487     }
10488   }
10489   MBB.insert(InsertTo, NewMI);
10490   return Result;
10491 }
10492 
10493 void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
10494     int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
10495     DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) {
10496   // Create and accumulate conditions for next TC iterations.
10497   // Example:
10498   //   SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
10499   //                                          # iteration of the kernel
10500   //
10501   //   # insert the following instructions
10502   //   cond = CSINCXr 0, 0, C, implicit $nzcv
10503   //   counter = ADDXri counter, 1            # clone from this->Update
10504   //   SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
10505   //   cond = CSINCXr cond, cond, C, implicit $nzcv
10506   //   ... (repeat TC times)
10507   //   SUBSXri cond, 0, implicit-def $nzcv
10508 
10509   assert(CondBranch->getOpcode() == AArch64::Bcc);
10510   // CondCode to exit the loop
10511   AArch64CC::CondCode CC =
10512       (AArch64CC::CondCode)CondBranch->getOperand(0).getImm();
10513   if (CondBranch->getOperand(1).getMBB() == LoopBB)
10514     CC = AArch64CC::getInvertedCondCode(CC);
10515 
10516   // Accumulate conditions to exit the loop
10517   Register AccCond = AArch64::XZR;
10518 
10519   // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
10520   auto AccumulateCond = [&](Register CurCond,
10521                             AArch64CC::CondCode CC) -> Register {
10522     Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
10523     BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr))
10524         .addReg(NewCond, RegState::Define)
10525         .addReg(CurCond)
10526         .addReg(CurCond)
10527         .addImm(AArch64CC::getInvertedCondCode(CC));
10528     return NewCond;
10529   };
10530 
10531   if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
10532     // Update and Comp for I == 0 already exist in MBB
10533     // (MBB is an unrolled kernel).
10534     Register Counter;
10535     for (int I = 0; I <= TC; ++I) {
10536       Register NextCounter;
10537       if (I != 0)
10538         NextCounter =
10539             cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
10540 
10541       AccCond = AccumulateCond(AccCond, CC);
10542 
10543       if (I != TC) {
10544         if (I == 0) {
10545           if (Update != Comp && IsUpdatePriorComp) {
10546             Counter =
10547                 LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
10548             NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB,
10549                                      MBB.end());
10550           } else {
10551             // Can use the already-calculated value.
10552             NextCounter = LastStage0Insts[Update]->getOperand(0).getReg();
10553           }
10554         } else if (Update != Comp) {
10555           NextCounter =
10556               cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
10557         }
10558       }
10559       Counter = NextCounter;
10560     }
10561   } else {
10562     Register Counter;
10563     if (LastStage0Insts.empty()) {
10564       // Use the initial counter value (testing whether the trip count is
10565       // sufficient for the pipelined code to be executed).
10566       Counter = Init;
10567       if (IsUpdatePriorComp)
10568         Counter =
10569             cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
10570     } else {
10571       // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
10572       Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
10573     }
10574 
10575     for (int I = 0; I <= TC; ++I) {
10576       Register NextCounter;
10577       NextCounter =
10578           cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
10579       AccCond = AccumulateCond(AccCond, CC);
10580       if (I != TC && Update != Comp)
10581         NextCounter =
10582             cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
10583       Counter = NextCounter;
10584     }
10585   }
10586 
10587   // If AccCond == 0, the remainder is greater than TC.
10588   BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri))
10589       .addReg(AArch64::XZR, RegState::Define | RegState::Dead)
10590       .addReg(AccCond)
10591       .addImm(0)
10592       .addImm(0);
10593   Cond.clear();
10594   Cond.push_back(MachineOperand::CreateImm(AArch64CC::EQ));
10595 }
10596 
10597 static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
10598                           Register &RegMBB, Register &RegOther) {
10599   assert(Phi.getNumOperands() == 5);
10600   if (Phi.getOperand(2).getMBB() == MBB) {
10601     RegMBB = Phi.getOperand(1).getReg();
10602     RegOther = Phi.getOperand(3).getReg();
10603   } else {
10604     assert(Phi.getOperand(4).getMBB() == MBB);
10605     RegMBB = Phi.getOperand(3).getReg();
10606     RegOther = Phi.getOperand(1).getReg();
10607   }
10608 }
10609 
10610 static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) {
10611   if (!Reg.isVirtual())
10612     return false;
10613   const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
10614   return MRI.getVRegDef(Reg)->getParent() != BB;
10615 }
10616 
10617 /// If Reg is an induction variable, return true and set the output parameters.
10618 static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
10619                           MachineInstr *&UpdateInst,
10620                           unsigned &UpdateCounterOprNum, Register &InitReg,
10621                           bool &IsUpdatePriorComp) {
10622   // Example:
10623   //
10624   // Preheader:
10625   //   InitReg = ...
10626   // LoopBB:
10627   //   Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
10628   //   Reg = COPY Reg0 ; COPY is ignored.
10629   //   Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
10630   //                     ; Reg is the value calculated in the previous
10631   //                     ; iteration, so IsUpdatePriorComp == false.
10632 
10633   if (LoopBB->pred_size() != 2)
10634     return false;
10635   if (!Reg.isVirtual())
10636     return false;
10637   const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
10638   UpdateInst = nullptr;
10639   UpdateCounterOprNum = 0;
10640   InitReg = 0;
10641   IsUpdatePriorComp = true;
10642   Register CurReg = Reg;
10643   while (true) {
10644     MachineInstr *Def = MRI.getVRegDef(CurReg);
10645     if (Def->getParent() != LoopBB)
10646       return false;
10647     if (Def->isCopy()) {
10648       // Ignore copy instructions unless they contain subregisters
10649       if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
10650         return false;
10651       CurReg = Def->getOperand(1).getReg();
10652     } else if (Def->isPHI()) {
10653       if (InitReg != 0)
10654         return false;
10655       if (!UpdateInst)
10656         IsUpdatePriorComp = false;
10657       extractPhiReg(*Def, LoopBB, CurReg, InitReg);
10658     } else {
10659       if (UpdateInst)
10660         return false;
10661       switch (Def->getOpcode()) {
10662       case AArch64::ADDSXri:
10663       case AArch64::ADDSWri:
10664       case AArch64::SUBSXri:
10665       case AArch64::SUBSWri:
10666       case AArch64::ADDXri:
10667       case AArch64::ADDWri:
10668       case AArch64::SUBXri:
10669       case AArch64::SUBWri:
10670         UpdateInst = Def;
10671         UpdateCounterOprNum = 1;
10672         break;
10673       case AArch64::ADDSXrr:
10674       case AArch64::ADDSWrr:
10675       case AArch64::SUBSXrr:
10676       case AArch64::SUBSWrr:
10677       case AArch64::ADDXrr:
10678       case AArch64::ADDWrr:
10679       case AArch64::SUBXrr:
10680       case AArch64::SUBWrr:
10681         UpdateInst = Def;
10682         if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB))
10683           UpdateCounterOprNum = 1;
10684         else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB))
10685           UpdateCounterOprNum = 2;
10686         else
10687           return false;
10688         break;
10689       default:
10690         return false;
10691       }
10692       CurReg = Def->getOperand(UpdateCounterOprNum).getReg();
10693     }
10694 
10695     if (!CurReg.isVirtual())
10696       return false;
10697     if (Reg == CurReg)
10698       break;
10699   }
10700 
10701   if (!UpdateInst)
10702     return false;
10703 
10704   return true;
10705 }
10706 
10707 std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
10708 AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
10709   // Accept loops that meet the following conditions:
10710   // * The conditional branch is BCC
10711   // * The compare instruction is ADDS/SUBS/WHILEXX
10712   // * One operand of the compare is an induction variable and the other is a
10713   //   loop invariant value
10714   // * The induction variable is incremented/decremented by a single instruction
10715   // * Does not contain CALL or instructions which have unmodeled side effects
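  //
  // For instance, a loop of the following shape (hypothetical) is accepted:
  //
  //   loop:
  //     ...
  //     subs x1, x1, #1   ; compare and induction-variable update in one
  //     b.ne loop         ; conditional branch is Bcc (b.ne alias)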
10716 
10717   for (MachineInstr &MI : *LoopBB)
10718     if (MI.isCall() || MI.hasUnmodeledSideEffects())
10719       // This instruction may use NZCV, which interferes with the instruction to
10720       // be inserted for loop control.
10721       return nullptr;
10722 
10723   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
10724   SmallVector<MachineOperand, 4> Cond;
10725   if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
10726     return nullptr;
10727 
10728   // Infinite loops are not supported
10729   if (TBB == LoopBB && FBB == LoopBB)
10730     return nullptr;
10731 
10732   // Must be conditional branch
10733   if (TBB != LoopBB && FBB == nullptr)
10734     return nullptr;
10735 
10736   assert((TBB == LoopBB || FBB == LoopBB) &&
10737          "The Loop must be a single-basic-block loop");
10738 
10739   MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
10740   const TargetRegisterInfo &TRI = getRegisterInfo();
10741 
10742   if (CondBranch->getOpcode() != AArch64::Bcc)
10743     return nullptr;
10744 
10745   // Normalization for createTripCountGreaterCondition()
10746   if (TBB == LoopBB)
10747     reverseBranchCondition(Cond);
10748 
10749   MachineInstr *Comp = nullptr;
10750   unsigned CompCounterOprNum = 0;
10751   for (MachineInstr &MI : reverse(*LoopBB)) {
10752     if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
10753       // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
10754       // operands is a loop invariant value
10755 
10756       switch (MI.getOpcode()) {
10757       case AArch64::SUBSXri:
10758       case AArch64::SUBSWri:
10759       case AArch64::ADDSXri:
10760       case AArch64::ADDSWri:
10761         Comp = &MI;
10762         CompCounterOprNum = 1;
10763         break;
10764       case AArch64::ADDSWrr:
10765       case AArch64::ADDSXrr:
10766       case AArch64::SUBSWrr:
10767       case AArch64::SUBSXrr:
10768         Comp = &MI;
10769         break;
10770       default:
10771         if (isWhileOpcode(MI.getOpcode())) {
10772           Comp = &MI;
10773           break;
10774         }
10775         return nullptr;
10776       }
10777 
10778       if (CompCounterOprNum == 0) {
10779         if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB))
10780           CompCounterOprNum = 2;
10781         else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB))
10782           CompCounterOprNum = 1;
10783         else
10784           return nullptr;
10785       }
10786       break;
10787     }
10788   }
10789   if (!Comp)
10790     return nullptr;
10791 
10792   MachineInstr *Update = nullptr;
10793   Register Init;
10794   bool IsUpdatePriorComp;
10795   unsigned UpdateCounterOprNum;
10796   if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB,
10797                      Update, UpdateCounterOprNum, Init, IsUpdatePriorComp))
10798     return nullptr;
10799 
10800   return std::make_unique<AArch64PipelinerLoopInfo>(
10801       LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum,
10802       Init, IsUpdatePriorComp, Cond);
10803 }
10804 
10805 /// verifyInstruction - Perform target-specific instruction verification.
10806 bool AArch64InstrInfo::verifyInstruction(const MachineInstr &MI,
10807                                          StringRef &ErrInfo) const {
10808 
10809   // Verify that immediate offsets on load/store instructions are within range.
10810   // Stack objects with an FI operand are excluded as they can be fixed up
10811   // during PEI.
10812   TypeSize Scale(0U, false), Width(0U, false);
10813   int64_t MinOffset, MaxOffset;
10814   if (getMemOpInfo(MI.getOpcode(), Scale, Width, MinOffset, MaxOffset)) {
10815     unsigned ImmIdx = getLoadStoreImmIdx(MI.getOpcode());
10816     if (MI.getOperand(ImmIdx).isImm() && !MI.getOperand(ImmIdx - 1).isFI()) {
10817       int64_t Imm = MI.getOperand(ImmIdx).getImm();
10818       if (Imm < MinOffset || Imm > MaxOffset) {
10819         ErrInfo = "Unexpected immediate on load/store instruction";
10820         return false;
10821       }
10822     }
10823   }
10824   return true;
10825 }
10826 
10827 #define GET_INSTRINFO_HELPERS
10828 #define GET_INSTRMAP_INFO
10829 #include "AArch64GenInstrInfo.inc"
10830