xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp (revision c80e69b00d976a5a3b3e84527f270fa7e72a8205)
1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of the TargetInstrInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64InstrInfo.h"
14 #include "AArch64ExpandImm.h"
15 #include "AArch64FrameLowering.h"
16 #include "AArch64MachineFunctionInfo.h"
17 #include "AArch64PointerAuth.h"
18 #include "AArch64Subtarget.h"
19 #include "MCTargetDesc/AArch64AddressingModes.h"
20 #include "MCTargetDesc/AArch64MCTargetDesc.h"
21 #include "Utils/AArch64BaseInfo.h"
22 #include "llvm/ADT/ArrayRef.h"
23 #include "llvm/ADT/STLExtras.h"
24 #include "llvm/ADT/SmallVector.h"
25 #include "llvm/CodeGen/LivePhysRegs.h"
26 #include "llvm/CodeGen/MachineBasicBlock.h"
27 #include "llvm/CodeGen/MachineCombinerPattern.h"
28 #include "llvm/CodeGen/MachineFrameInfo.h"
29 #include "llvm/CodeGen/MachineFunction.h"
30 #include "llvm/CodeGen/MachineInstr.h"
31 #include "llvm/CodeGen/MachineInstrBuilder.h"
32 #include "llvm/CodeGen/MachineMemOperand.h"
33 #include "llvm/CodeGen/MachineModuleInfo.h"
34 #include "llvm/CodeGen/MachineOperand.h"
35 #include "llvm/CodeGen/MachineRegisterInfo.h"
36 #include "llvm/CodeGen/RegisterScavenging.h"
37 #include "llvm/CodeGen/StackMaps.h"
38 #include "llvm/CodeGen/TargetRegisterInfo.h"
39 #include "llvm/CodeGen/TargetSubtargetInfo.h"
40 #include "llvm/IR/DebugInfoMetadata.h"
41 #include "llvm/IR/DebugLoc.h"
42 #include "llvm/IR/GlobalValue.h"
43 #include "llvm/IR/Module.h"
44 #include "llvm/MC/MCAsmInfo.h"
45 #include "llvm/MC/MCInst.h"
46 #include "llvm/MC/MCInstBuilder.h"
47 #include "llvm/MC/MCInstrDesc.h"
48 #include "llvm/Support/Casting.h"
49 #include "llvm/Support/CodeGen.h"
50 #include "llvm/Support/CommandLine.h"
51 #include "llvm/Support/ErrorHandling.h"
52 #include "llvm/Support/LEB128.h"
53 #include "llvm/Support/MathExtras.h"
54 #include "llvm/Target/TargetMachine.h"
55 #include "llvm/Target/TargetOptions.h"
56 #include <cassert>
57 #include <cstdint>
58 #include <iterator>
59 #include <utility>
60 
61 using namespace llvm;
62 
63 #define GET_INSTRINFO_CTOR_DTOR
64 #include "AArch64GenInstrInfo.inc"
65 
66 static cl::opt<unsigned> TBZDisplacementBits(
67     "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
68     cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
69 
70 static cl::opt<unsigned> CBZDisplacementBits(
71     "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
72     cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
73 
74 static cl::opt<unsigned>
75     BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
76                         cl::desc("Restrict range of Bcc instructions (DEBUG)"));
77 
78 static cl::opt<unsigned>
79     BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
80                       cl::desc("Restrict range of B instructions (DEBUG)"));
81 
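// These hidden flags appear to exist only to artificially restrict branch
// ranges so that branch relaxation can be exercised on small test inputs.
// A hypothetical invocation (flag values chosen purely for illustration):
//   llc -mtriple=aarch64 -aarch64-tbz-offset-bits=3 -aarch64-cbz-offset-bits=3 test.ll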
82 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
83     : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
84                           AArch64::CATCHRET),
85       RI(STI.getTargetTriple()), Subtarget(STI) {}
86 
87 /// getInstSizeInBytes - Return the number of bytes of code the specified
88 /// instruction may be.  This returns the maximum number of bytes.
89 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
90   const MachineBasicBlock &MBB = *MI.getParent();
91   const MachineFunction *MF = MBB.getParent();
92   const Function &F = MF->getFunction();
93   const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
94 
95   {
96     auto Op = MI.getOpcode();
97     if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
98       return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
99   }
100 
101   // Meta-instructions emit no code.
102   if (MI.isMetaInstruction())
103     return 0;
104 
105   // FIXME: We currently only handle pseudoinstructions that don't get expanded
106   //        before the assembly printer.
107   unsigned NumBytes = 0;
108   const MCInstrDesc &Desc = MI.getDesc();
109 
110   // The size should preferably be set in
111   // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
112   // The specific cases below handle instructions of variable size.
113   switch (Desc.getOpcode()) {
114   default:
115     if (Desc.getSize())
116       return Desc.getSize();
117 
118     // Anything not explicitly designated otherwise (i.e. pseudo-instructions
119     // with fixed constant size but not specified in .td file) is a normal
120     // 4-byte insn.
121     NumBytes = 4;
122     break;
123   case TargetOpcode::STACKMAP:
124     // The upper bound for a stackmap intrinsic is the full length of its shadow
125     NumBytes = StackMapOpers(&MI).getNumPatchBytes();
126     assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
127     break;
128   case TargetOpcode::PATCHPOINT:
129     // The size of the patchpoint intrinsic is the number of bytes requested
130     NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
131     assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
132     break;
133   case TargetOpcode::STATEPOINT:
134     NumBytes = StatepointOpers(&MI).getNumPatchBytes();
135     assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
136     // No patch bytes means a normal call inst is emitted
137     if (NumBytes == 0)
138       NumBytes = 4;
139     break;
140   case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
141     // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
142     // instructions are expanded to the specified number of NOPs. Otherwise,
143     // they are expanded to 36-byte XRay sleds.
144     NumBytes =
145         F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
146     break;
147   case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
148   case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
149     // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
150     NumBytes = 36;
151     break;
152   case TargetOpcode::PATCHABLE_EVENT_CALL:
153     // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
154     NumBytes = 24;
155     break;
156 
157   case AArch64::SPACE:
158     NumBytes = MI.getOperand(1).getImm();
159     break;
160   case TargetOpcode::BUNDLE:
161     NumBytes = getInstBundleLength(MI);
162     break;
163   }
164 
165   return NumBytes;
166 }
167 
168 unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
169   unsigned Size = 0;
170   MachineBasicBlock::const_instr_iterator I = MI.getIterator();
171   MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
172   while (++I != E && I->isInsideBundle()) {
173     assert(!I->isBundle() && "No nested bundle!");
174     Size += getInstSizeInBytes(*I);
175   }
176   return Size;
177 }
178 
179 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
180                             SmallVectorImpl<MachineOperand> &Cond) {
181   // Block ends with fall-through condbranch.
182   switch (LastInst->getOpcode()) {
183   default:
184     llvm_unreachable("Unknown branch instruction?");
185   case AArch64::Bcc:
186     Target = LastInst->getOperand(1).getMBB();
187     Cond.push_back(LastInst->getOperand(0));
188     break;
189   case AArch64::CBZW:
190   case AArch64::CBZX:
191   case AArch64::CBNZW:
192   case AArch64::CBNZX:
193     Target = LastInst->getOperand(1).getMBB();
194     Cond.push_back(MachineOperand::CreateImm(-1));
195     Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
196     Cond.push_back(LastInst->getOperand(0));
197     break;
198   case AArch64::TBZW:
199   case AArch64::TBZX:
200   case AArch64::TBNZW:
201   case AArch64::TBNZX:
202     Target = LastInst->getOperand(2).getMBB();
203     Cond.push_back(MachineOperand::CreateImm(-1));
204     Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
205     Cond.push_back(LastInst->getOperand(0));
206     Cond.push_back(LastInst->getOperand(1));
207   }
208 }
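
// For reference, the Cond vector built above (and consumed by
// reverseBranchCondition, instantiateCondBranch and insertSelect below) is
// laid out as follows, as can be read off the switch above:
//   Bcc:          Cond = { <condition code> }
//   CB(N)Z[WX]:   Cond = { -1, <opcode>, <reg> }
//   TB(N)Z[WX]:   Cond = { -1, <opcode>, <reg>, <bit number> }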
209 
210 static unsigned getBranchDisplacementBits(unsigned Opc) {
211   switch (Opc) {
212   default:
213     llvm_unreachable("unexpected opcode!");
214   case AArch64::B:
215     return BDisplacementBits;
216   case AArch64::TBNZW:
217   case AArch64::TBZW:
218   case AArch64::TBNZX:
219   case AArch64::TBZX:
220     return TBZDisplacementBits;
221   case AArch64::CBNZW:
222   case AArch64::CBZW:
223   case AArch64::CBNZX:
224   case AArch64::CBZX:
225     return CBZDisplacementBits;
226   case AArch64::Bcc:
227     return BCCDisplacementBits;
228   }
229 }
230 
231 bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
232                                              int64_t BrOffset) const {
233   unsigned Bits = getBranchDisplacementBits(BranchOp);
234   assert(Bits >= 3 && "max branch displacement must be enough to jump"
235                       "over conditional branch expansion");
236   return isIntN(Bits, BrOffset / 4);
237 }
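
// With the default widths above, and byte offsets relative to the branch
// (BrOffset / 4 must fit in a signed Bits-bit field), the reachable ranges
// work out to roughly:
//   B                26 bits  ->  +/- 128 MiB
//   Bcc, CB(N)Z      19 bits  ->  +/- 1 MiB
//   TB(N)Z           14 bits  ->  +/- 32 KiB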
238 
239 MachineBasicBlock *
240 AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
241   switch (MI.getOpcode()) {
242   default:
243     llvm_unreachable("unexpected opcode!");
244   case AArch64::B:
245     return MI.getOperand(0).getMBB();
246   case AArch64::TBZW:
247   case AArch64::TBNZW:
248   case AArch64::TBZX:
249   case AArch64::TBNZX:
250     return MI.getOperand(2).getMBB();
251   case AArch64::CBZW:
252   case AArch64::CBNZW:
253   case AArch64::CBZX:
254   case AArch64::CBNZX:
255   case AArch64::Bcc:
256     return MI.getOperand(1).getMBB();
257   }
258 }
259 
260 void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
261                                             MachineBasicBlock &NewDestBB,
262                                             MachineBasicBlock &RestoreBB,
263                                             const DebugLoc &DL,
264                                             int64_t BrOffset,
265                                             RegScavenger *RS) const {
266   assert(RS && "RegScavenger required for long branching");
267   assert(MBB.empty() &&
268          "new block should be inserted for expanding unconditional branch");
269   assert(MBB.pred_size() == 1);
270   assert(RestoreBB.empty() &&
271          "restore block should be inserted for restoring clobbered registers");
272 
273   auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
274     // Offsets outside of the signed 33-bit range are not supported for ADRP +
275     // ADD.
276     if (!isInt<33>(BrOffset))
277       report_fatal_error(
278           "Branch offsets outside of the signed 33-bit range not supported");
279 
280     BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
281         .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
282     BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
283         .addReg(Reg)
284         .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
285         .addImm(0);
286     BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
287   };
288 
289   RS->enterBasicBlockEnd(MBB);
290   // If X16 is unused, we can rely on the linker to insert a range extension
291   // thunk if NewDestBB is out of range of a single B instruction.
292   constexpr Register Reg = AArch64::X16;
293   if (!RS->isRegUsed(Reg)) {
294     insertUnconditionalBranch(MBB, &NewDestBB, DL);
295     RS->setRegUsed(Reg);
296     return;
297   }
298 
299   // If there's a free register and it's worth inflating the code size,
300   // manually insert the indirect branch.
301   Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
302   if (Scavenged != AArch64::NoRegister &&
303       MBB.getSectionID() == MBBSectionID::ColdSectionID) {
304     buildIndirectBranch(Scavenged, NewDestBB);
305     RS->setRegUsed(Scavenged);
306     return;
307   }
308 
309   // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
310   // with red zones.
311   AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
312   if (!AFI || AFI->hasRedZone().value_or(true))
313     report_fatal_error(
314         "Unable to insert indirect branch inside function that has red zone");
315 
316   // Otherwise, spill X16 and defer range extension to the linker.
317   BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
318       .addReg(AArch64::SP, RegState::Define)
319       .addReg(Reg)
320       .addReg(AArch64::SP)
321       .addImm(-16);
322 
323   BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);
324 
325   BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
326       .addReg(AArch64::SP, RegState::Define)
327       .addReg(Reg, RegState::Define)
328       .addReg(AArch64::SP)
329       .addImm(16);
330 }
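
// A rough sketch of the sequences built above (register choices illustrative):
//   Cold-section path, using a scavenged GPR such as x9:
//     adrp x9, DestBB
//     add  x9, x9, :lo12:DestBB
//     br   x9
//   Spill path, when X16 is live and must be preserved:
//     str  x16, [sp, #-16]!      // pre-indexed spill; briefly moves SP
//     b    RestoreBB             // long branch; linker may insert a thunk
//   RestoreBB:
//     ldr  x16, [sp], #16        // post-indexed reload, then fall through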
331 
332 // Branch analysis.
333 bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
334                                      MachineBasicBlock *&TBB,
335                                      MachineBasicBlock *&FBB,
336                                      SmallVectorImpl<MachineOperand> &Cond,
337                                      bool AllowModify) const {
338   // If the block has no terminators, it just falls into the block after it.
339   MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
340   if (I == MBB.end())
341     return false;
342 
343   // Skip over SpeculationBarrierEndBB terminators
344   if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
345       I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
346     --I;
347   }
348 
349   if (!isUnpredicatedTerminator(*I))
350     return false;
351 
352   // Get the last instruction in the block.
353   MachineInstr *LastInst = &*I;
354 
355   // If there is only one terminator instruction, process it.
356   unsigned LastOpc = LastInst->getOpcode();
357   if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
358     if (isUncondBranchOpcode(LastOpc)) {
359       TBB = LastInst->getOperand(0).getMBB();
360       return false;
361     }
362     if (isCondBranchOpcode(LastOpc)) {
363       // Block ends with fall-through condbranch.
364       parseCondBranch(LastInst, TBB, Cond);
365       return false;
366     }
367     return true; // Can't handle indirect branch.
368   }
369 
370   // Get the instruction before it if it is a terminator.
371   MachineInstr *SecondLastInst = &*I;
372   unsigned SecondLastOpc = SecondLastInst->getOpcode();
373 
374   // If AllowModify is true and the block ends with two or more unconditional
375   // branches, delete all but the first unconditional branch.
376   if (AllowModify && isUncondBranchOpcode(LastOpc)) {
377     while (isUncondBranchOpcode(SecondLastOpc)) {
378       LastInst->eraseFromParent();
379       LastInst = SecondLastInst;
380       LastOpc = LastInst->getOpcode();
381       if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
382         // Return now; the only terminator is an unconditional branch.
383         TBB = LastInst->getOperand(0).getMBB();
384         return false;
385       }
386       SecondLastInst = &*I;
387       SecondLastOpc = SecondLastInst->getOpcode();
388     }
389   }
390 
391   // If we're allowed to modify and the block ends in an unconditional branch
392   // which could simply fallthrough, remove the branch.  (Note: This case only
393   // matters when we can't understand the whole sequence, otherwise it's also
394   // handled by BranchFolding.cpp.)
395   if (AllowModify && isUncondBranchOpcode(LastOpc) &&
396       MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
397     LastInst->eraseFromParent();
398     LastInst = SecondLastInst;
399     LastOpc = LastInst->getOpcode();
400     if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
401       assert(!isUncondBranchOpcode(LastOpc) &&
402              "unreachable unconditional branches removed above");
403 
404       if (isCondBranchOpcode(LastOpc)) {
405         // Block ends with fall-through condbranch.
406         parseCondBranch(LastInst, TBB, Cond);
407         return false;
408       }
409       return true; // Can't handle indirect branch.
410     }
411     SecondLastInst = &*I;
412     SecondLastOpc = SecondLastInst->getOpcode();
413   }
414 
415   // If there are three terminators, we don't know what sort of block this is.
416   if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
417     return true;
418 
419   // If the block ends with a B and a Bcc, handle it.
420   if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
421     parseCondBranch(SecondLastInst, TBB, Cond);
422     FBB = LastInst->getOperand(0).getMBB();
423     return false;
424   }
425 
426   // If the block ends with two unconditional branches, handle it.  The second
427   // one is not executed, so remove it.
428   if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
429     TBB = SecondLastInst->getOperand(0).getMBB();
430     I = LastInst;
431     if (AllowModify)
432       I->eraseFromParent();
433     return false;
434   }
435 
436   // ...likewise if it ends with an indirect branch followed by an unconditional
437   // branch.
438   if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
439     I = LastInst;
440     if (AllowModify)
441       I->eraseFromParent();
442     return true;
443   }
444 
445   // Otherwise, can't handle this.
446   return true;
447 }
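
// In summary, the terminator shapes recognised above are (per the
// TargetInstrInfo::analyzeBranch contract):
//   <cond-br>              -> TBB = cond target, Cond filled, FBB left null
//   <uncond-br>            -> TBB = branch target
//   <cond-br> <uncond-br>  -> TBB = cond target, FBB = uncond target
//   anything else          -> return true (cannot analyze)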
448 
449 bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
450                                               MachineBranchPredicate &MBP,
451                                               bool AllowModify) const {
452   // For the moment, handle only a block that ends with a cb(n)z[wx] followed
453   // by a fallthrough, since that is a common form.
454   // TODO: Should we handle b.cc?
455 
456   MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
457   if (I == MBB.end())
458     return true;
459 
460   // Skip over SpeculationBarrierEndBB terminators
461   if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
462       I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
463     --I;
464   }
465 
466   if (!isUnpredicatedTerminator(*I))
467     return true;
468 
469   // Get the last instruction in the block.
470   MachineInstr *LastInst = &*I;
471   unsigned LastOpc = LastInst->getOpcode();
472   if (!isCondBranchOpcode(LastOpc))
473     return true;
474 
475   switch (LastOpc) {
476   default:
477     return true;
478   case AArch64::CBZW:
479   case AArch64::CBZX:
480   case AArch64::CBNZW:
481   case AArch64::CBNZX:
482     break;
483   };
484 
485   MBP.TrueDest = LastInst->getOperand(1).getMBB();
486   assert(MBP.TrueDest && "expected!");
487   MBP.FalseDest = MBB.getNextNode();
488 
489   MBP.ConditionDef = nullptr;
490   MBP.SingleUseCondition = false;
491 
492   MBP.LHS = LastInst->getOperand(0);
493   MBP.RHS = MachineOperand::CreateImm(0);
494   MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
495                                             : MachineBranchPredicate::PRED_EQ;
496   return false;
497 }
498 
499 bool AArch64InstrInfo::reverseBranchCondition(
500     SmallVectorImpl<MachineOperand> &Cond) const {
501   if (Cond[0].getImm() != -1) {
502     // Regular Bcc
503     AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
504     Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
505   } else {
506     // Folded compare-and-branch
507     switch (Cond[1].getImm()) {
508     default:
509       llvm_unreachable("Unknown conditional branch!");
510     case AArch64::CBZW:
511       Cond[1].setImm(AArch64::CBNZW);
512       break;
513     case AArch64::CBNZW:
514       Cond[1].setImm(AArch64::CBZW);
515       break;
516     case AArch64::CBZX:
517       Cond[1].setImm(AArch64::CBNZX);
518       break;
519     case AArch64::CBNZX:
520       Cond[1].setImm(AArch64::CBZX);
521       break;
522     case AArch64::TBZW:
523       Cond[1].setImm(AArch64::TBNZW);
524       break;
525     case AArch64::TBNZW:
526       Cond[1].setImm(AArch64::TBZW);
527       break;
528     case AArch64::TBZX:
529       Cond[1].setImm(AArch64::TBNZX);
530       break;
531     case AArch64::TBNZX:
532       Cond[1].setImm(AArch64::TBZX);
533       break;
534     }
535   }
536 
537   return false;
538 }
539 
540 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
541                                         int *BytesRemoved) const {
542   MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
543   if (I == MBB.end())
544     return 0;
545 
546   if (!isUncondBranchOpcode(I->getOpcode()) &&
547       !isCondBranchOpcode(I->getOpcode()))
548     return 0;
549 
550   // Remove the branch.
551   I->eraseFromParent();
552 
553   I = MBB.end();
554 
555   if (I == MBB.begin()) {
556     if (BytesRemoved)
557       *BytesRemoved = 4;
558     return 1;
559   }
560   --I;
561   if (!isCondBranchOpcode(I->getOpcode())) {
562     if (BytesRemoved)
563       *BytesRemoved = 4;
564     return 1;
565   }
566 
567   // Remove the branch.
568   I->eraseFromParent();
569   if (BytesRemoved)
570     *BytesRemoved = 8;
571 
572   return 2;
573 }
574 
575 void AArch64InstrInfo::instantiateCondBranch(
576     MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
577     ArrayRef<MachineOperand> Cond) const {
578   if (Cond[0].getImm() != -1) {
579     // Regular Bcc
580     BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
581   } else {
582     // Folded compare-and-branch
583     // Note that we use addOperand instead of addReg to keep the flags.
584     const MachineInstrBuilder MIB =
585         BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
586     if (Cond.size() > 3)
587       MIB.addImm(Cond[3].getImm());
588     MIB.addMBB(TBB);
589   }
590 }
591 
592 unsigned AArch64InstrInfo::insertBranch(
593     MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
594     ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
595   // Shouldn't be a fall through.
596   assert(TBB && "insertBranch must not be told to insert a fallthrough");
597 
598   if (!FBB) {
599     if (Cond.empty()) // Unconditional branch?
600       BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
601     else
602       instantiateCondBranch(MBB, DL, TBB, Cond);
603 
604     if (BytesAdded)
605       *BytesAdded = 4;
606 
607     return 1;
608   }
609 
610   // Two-way conditional branch.
611   instantiateCondBranch(MBB, DL, TBB, Cond);
612   BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
613 
614   if (BytesAdded)
615     *BytesAdded = 8;
616 
617   return 2;
618 }
619 
620 // Find the original register that VReg is copied from.
621 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
622   while (Register::isVirtualRegister(VReg)) {
623     const MachineInstr *DefMI = MRI.getVRegDef(VReg);
624     if (!DefMI->isFullCopy())
625       return VReg;
626     VReg = DefMI->getOperand(1).getReg();
627   }
628   return VReg;
629 }
630 
631 // Determine if VReg is defined by an instruction that can be folded into a
632 // csel instruction. If so, return the folded opcode, and the replacement
633 // register.
634 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
635                                 unsigned *NewVReg = nullptr) {
636   VReg = removeCopies(MRI, VReg);
637   if (!Register::isVirtualRegister(VReg))
638     return 0;
639 
640   bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
641   const MachineInstr *DefMI = MRI.getVRegDef(VReg);
642   unsigned Opc = 0;
643   unsigned SrcOpNum = 0;
644   switch (DefMI->getOpcode()) {
645   case AArch64::ADDSXri:
646   case AArch64::ADDSWri:
647     // if NZCV is used, do not fold.
648     if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
649                                          true) == -1)
650       return 0;
651     // fall-through to ADDXri and ADDWri.
652     [[fallthrough]];
653   case AArch64::ADDXri:
654   case AArch64::ADDWri:
655     // add x, 1 -> csinc.
656     if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
657         DefMI->getOperand(3).getImm() != 0)
658       return 0;
659     SrcOpNum = 1;
660     Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
661     break;
662 
663   case AArch64::ORNXrr:
664   case AArch64::ORNWrr: {
665     // not x -> csinv, represented as orn dst, xzr, src.
666     unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
667     if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
668       return 0;
669     SrcOpNum = 2;
670     Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
671     break;
672   }
673 
674   case AArch64::SUBSXrr:
675   case AArch64::SUBSWrr:
676     // if NZCV is used, do not fold.
677     if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
678                                          true) == -1)
679       return 0;
680     // fall-through to SUBXrr and SUBWrr.
681     [[fallthrough]];
682   case AArch64::SUBXrr:
683   case AArch64::SUBWrr: {
684     // neg x -> csneg, represented as sub dst, xzr, src.
685     unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
686     if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
687       return 0;
688     SrcOpNum = 2;
689     Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
690     break;
691   }
692   default:
693     return 0;
694   }
695   assert(Opc && SrcOpNum && "Missing parameters");
696 
697   if (NewVReg)
698     *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
699   return Opc;
700 }
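
// Illustrative defs recognised above (W forms shown; X forms are analogous):
//   %v = ADDWri %a, 1, 0      // a + 1  -> folds into CSINCWr
//   %v = ORNWrr $wzr, %a      // ~a     -> folds into CSINVWr
//   %v = SUBWrr $wzr, %a      // -a     -> folds into CSNEGWr
// In each case *NewVReg is set to %a and the returned opcode is substituted
// for the plain CSEL in insertSelect() below.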
701 
702 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
703                                        ArrayRef<MachineOperand> Cond,
704                                        Register DstReg, Register TrueReg,
705                                        Register FalseReg, int &CondCycles,
706                                        int &TrueCycles,
707                                        int &FalseCycles) const {
708   // Check register classes.
709   const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
710   const TargetRegisterClass *RC =
711       RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
712   if (!RC)
713     return false;
714 
715   // Also need to check the dest regclass, in case we're trying to optimize
716   // something like:
717   // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
718   if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
719     return false;
720 
721   // Expanding cbz/tbz requires an extra cycle of latency on the condition.
722   unsigned ExtraCondLat = Cond.size() != 1;
723 
724   // GPRs are handled by csel.
725   // FIXME: Fold in x+1, -x, and ~x when applicable.
726   if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
727       AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
728     // Single-cycle csel, csinc, csinv, and csneg.
729     CondCycles = 1 + ExtraCondLat;
730     TrueCycles = FalseCycles = 1;
731     if (canFoldIntoCSel(MRI, TrueReg))
732       TrueCycles = 0;
733     else if (canFoldIntoCSel(MRI, FalseReg))
734       FalseCycles = 0;
735     return true;
736   }
737 
738   // Scalar floating point is handled by fcsel.
739   // FIXME: Form fabs, fmin, and fmax when applicable.
740   if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
741       AArch64::FPR32RegClass.hasSubClassEq(RC)) {
742     CondCycles = 5 + ExtraCondLat;
743     TrueCycles = FalseCycles = 2;
744     return true;
745   }
746 
747   // Can't do vectors.
748   return false;
749 }
750 
751 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
752                                     MachineBasicBlock::iterator I,
753                                     const DebugLoc &DL, Register DstReg,
754                                     ArrayRef<MachineOperand> Cond,
755                                     Register TrueReg, Register FalseReg) const {
756   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
757 
758   // Parse the condition code, see parseCondBranch() above.
759   AArch64CC::CondCode CC;
760   switch (Cond.size()) {
761   default:
762     llvm_unreachable("Unknown condition opcode in Cond");
763   case 1: // b.cc
764     CC = AArch64CC::CondCode(Cond[0].getImm());
765     break;
766   case 3: { // cbz/cbnz
767     // We must insert a compare against 0.
768     bool Is64Bit;
769     switch (Cond[1].getImm()) {
770     default:
771       llvm_unreachable("Unknown branch opcode in Cond");
772     case AArch64::CBZW:
773       Is64Bit = false;
774       CC = AArch64CC::EQ;
775       break;
776     case AArch64::CBZX:
777       Is64Bit = true;
778       CC = AArch64CC::EQ;
779       break;
780     case AArch64::CBNZW:
781       Is64Bit = false;
782       CC = AArch64CC::NE;
783       break;
784     case AArch64::CBNZX:
785       Is64Bit = true;
786       CC = AArch64CC::NE;
787       break;
788     }
789     Register SrcReg = Cond[2].getReg();
790     if (Is64Bit) {
791       // cmp reg, #0 is actually subs xzr, reg, #0.
792       MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
793       BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
794           .addReg(SrcReg)
795           .addImm(0)
796           .addImm(0);
797     } else {
798       MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
799       BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
800           .addReg(SrcReg)
801           .addImm(0)
802           .addImm(0);
803     }
804     break;
805   }
806   case 4: { // tbz/tbnz
807     // We must insert a tst instruction.
808     switch (Cond[1].getImm()) {
809     default:
810       llvm_unreachable("Unknown branch opcode in Cond");
811     case AArch64::TBZW:
812     case AArch64::TBZX:
813       CC = AArch64CC::EQ;
814       break;
815     case AArch64::TBNZW:
816     case AArch64::TBNZX:
817       CC = AArch64CC::NE;
818       break;
819     }
820     // tst reg, #foo is actually ands xzr, reg, #(1 << foo).
821     if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
822       BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
823           .addReg(Cond[2].getReg())
824           .addImm(
825               AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
826     else
827       BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
828           .addReg(Cond[2].getReg())
829           .addImm(
830               AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
831     break;
832   }
833   }
834 
835   unsigned Opc = 0;
836   const TargetRegisterClass *RC = nullptr;
837   bool TryFold = false;
838   if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
839     RC = &AArch64::GPR64RegClass;
840     Opc = AArch64::CSELXr;
841     TryFold = true;
842   } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
843     RC = &AArch64::GPR32RegClass;
844     Opc = AArch64::CSELWr;
845     TryFold = true;
846   } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
847     RC = &AArch64::FPR64RegClass;
848     Opc = AArch64::FCSELDrrr;
849   } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
850     RC = &AArch64::FPR32RegClass;
851     Opc = AArch64::FCSELSrrr;
852   }
853   assert(RC && "Unsupported regclass");
854 
855   // Try folding simple instructions into the csel.
856   if (TryFold) {
857     unsigned NewVReg = 0;
858     unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
859     if (FoldedOpc) {
860       // The folded opcodes csinc, csinv and csneg apply the operation to
861       // FalseReg, so we need to invert the condition.
862       CC = AArch64CC::getInvertedCondCode(CC);
863       TrueReg = FalseReg;
864     } else
865       FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
866 
867     // Fold the operation. Leave any dead instructions for DCE to clean up.
868     if (FoldedOpc) {
869       FalseReg = NewVReg;
870       Opc = FoldedOpc;
871       // This extends the live range of NewVReg.
872       MRI.clearKillFlags(NewVReg);
873     }
874   }
875 
876   // Pull all virtual registers into the appropriate class.
877   MRI.constrainRegClass(TrueReg, RC);
878   MRI.constrainRegClass(FalseReg, RC);
879 
880   // Insert the csel.
881   BuildMI(MBB, I, DL, get(Opc), DstReg)
882       .addReg(TrueReg)
883       .addReg(FalseReg)
884       .addImm(CC);
885 }
886 
887 // Return true if Imm can be loaded into a register by a "cheap" sequence of
888 // instructions. For now, "cheap" means at most two instructions.
889 static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
890   if (BitSize == 32)
891     return true;
892 
893   assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
894   uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
895   SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
896   AArch64_IMM::expandMOVImm(Imm, BitSize, Is);
897 
898   return Is.size() <= 2;
899 }
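
// For example (constants chosen purely for illustration): 0xabcdef expands to
// MOVZ + one MOVK and is considered cheap, whereas an arbitrary constant such
// as 0x123456789abcdef0 needs MOVZ + three MOVKs and is not.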
900 
901 // FIXME: this implementation should be micro-architecture dependent, so a
902 // micro-architecture target hook should be introduced here in the future.
903 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
904   if (Subtarget.hasExynosCheapAsMoveHandling()) {
905     if (isExynosCheapAsMove(MI))
906       return true;
907     return MI.isAsCheapAsAMove();
908   }
909 
910   switch (MI.getOpcode()) {
911   default:
912     return MI.isAsCheapAsAMove();
913 
914   case AArch64::ADDWrs:
915   case AArch64::ADDXrs:
916   case AArch64::SUBWrs:
917   case AArch64::SUBXrs:
918     return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;
919 
920   // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
921   // ORRXri, it is as cheap as MOV.
922   // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
923   case AArch64::MOVi32imm:
924     return isCheapImmediate(MI, 32);
925   case AArch64::MOVi64imm:
926     return isCheapImmediate(MI, 64);
927   }
928 }
929 
930 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
931   switch (MI.getOpcode()) {
932   default:
933     return false;
934 
935   case AArch64::ADDWrs:
936   case AArch64::ADDXrs:
937   case AArch64::ADDSWrs:
938   case AArch64::ADDSXrs: {
939     unsigned Imm = MI.getOperand(3).getImm();
940     unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
941     if (ShiftVal == 0)
942       return true;
943     return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
944   }
945 
946   case AArch64::ADDWrx:
947   case AArch64::ADDXrx:
948   case AArch64::ADDXrx64:
949   case AArch64::ADDSWrx:
950   case AArch64::ADDSXrx:
951   case AArch64::ADDSXrx64: {
952     unsigned Imm = MI.getOperand(3).getImm();
953     switch (AArch64_AM::getArithExtendType(Imm)) {
954     default:
955       return false;
956     case AArch64_AM::UXTB:
957     case AArch64_AM::UXTH:
958     case AArch64_AM::UXTW:
959     case AArch64_AM::UXTX:
960       return AArch64_AM::getArithShiftValue(Imm) <= 4;
961     }
962   }
963 
964   case AArch64::SUBWrs:
965   case AArch64::SUBSWrs: {
966     unsigned Imm = MI.getOperand(3).getImm();
967     unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
968     return ShiftVal == 0 ||
969            (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
970   }
971 
972   case AArch64::SUBXrs:
973   case AArch64::SUBSXrs: {
974     unsigned Imm = MI.getOperand(3).getImm();
975     unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
976     return ShiftVal == 0 ||
977            (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
978   }
979 
980   case AArch64::SUBWrx:
981   case AArch64::SUBXrx:
982   case AArch64::SUBXrx64:
983   case AArch64::SUBSWrx:
984   case AArch64::SUBSXrx:
985   case AArch64::SUBSXrx64: {
986     unsigned Imm = MI.getOperand(3).getImm();
987     switch (AArch64_AM::getArithExtendType(Imm)) {
988     default:
989       return false;
990     case AArch64_AM::UXTB:
991     case AArch64_AM::UXTH:
992     case AArch64_AM::UXTW:
993     case AArch64_AM::UXTX:
994       return AArch64_AM::getArithShiftValue(Imm) == 0;
995     }
996   }
997 
998   case AArch64::LDRBBroW:
999   case AArch64::LDRBBroX:
1000   case AArch64::LDRBroW:
1001   case AArch64::LDRBroX:
1002   case AArch64::LDRDroW:
1003   case AArch64::LDRDroX:
1004   case AArch64::LDRHHroW:
1005   case AArch64::LDRHHroX:
1006   case AArch64::LDRHroW:
1007   case AArch64::LDRHroX:
1008   case AArch64::LDRQroW:
1009   case AArch64::LDRQroX:
1010   case AArch64::LDRSBWroW:
1011   case AArch64::LDRSBWroX:
1012   case AArch64::LDRSBXroW:
1013   case AArch64::LDRSBXroX:
1014   case AArch64::LDRSHWroW:
1015   case AArch64::LDRSHWroX:
1016   case AArch64::LDRSHXroW:
1017   case AArch64::LDRSHXroX:
1018   case AArch64::LDRSWroW:
1019   case AArch64::LDRSWroX:
1020   case AArch64::LDRSroW:
1021   case AArch64::LDRSroX:
1022   case AArch64::LDRWroW:
1023   case AArch64::LDRWroX:
1024   case AArch64::LDRXroW:
1025   case AArch64::LDRXroX:
1026   case AArch64::PRFMroW:
1027   case AArch64::PRFMroX:
1028   case AArch64::STRBBroW:
1029   case AArch64::STRBBroX:
1030   case AArch64::STRBroW:
1031   case AArch64::STRBroX:
1032   case AArch64::STRDroW:
1033   case AArch64::STRDroX:
1034   case AArch64::STRHHroW:
1035   case AArch64::STRHHroX:
1036   case AArch64::STRHroW:
1037   case AArch64::STRHroX:
1038   case AArch64::STRQroW:
1039   case AArch64::STRQroX:
1040   case AArch64::STRSroW:
1041   case AArch64::STRSroX:
1042   case AArch64::STRWroW:
1043   case AArch64::STRWroX:
1044   case AArch64::STRXroW:
1045   case AArch64::STRXroX: {
1046     unsigned IsSigned = MI.getOperand(3).getImm();
1047     return !IsSigned;
1048   }
1049   }
1050 }
1051 
1052 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
1053   unsigned Opc = MI.getOpcode();
1054   switch (Opc) {
1055     default:
1056       return false;
1057     case AArch64::SEH_StackAlloc:
1058     case AArch64::SEH_SaveFPLR:
1059     case AArch64::SEH_SaveFPLR_X:
1060     case AArch64::SEH_SaveReg:
1061     case AArch64::SEH_SaveReg_X:
1062     case AArch64::SEH_SaveRegP:
1063     case AArch64::SEH_SaveRegP_X:
1064     case AArch64::SEH_SaveFReg:
1065     case AArch64::SEH_SaveFReg_X:
1066     case AArch64::SEH_SaveFRegP:
1067     case AArch64::SEH_SaveFRegP_X:
1068     case AArch64::SEH_SetFP:
1069     case AArch64::SEH_AddFP:
1070     case AArch64::SEH_Nop:
1071     case AArch64::SEH_PrologEnd:
1072     case AArch64::SEH_EpilogStart:
1073     case AArch64::SEH_EpilogEnd:
1074     case AArch64::SEH_PACSignLR:
1075     case AArch64::SEH_SaveAnyRegQP:
1076     case AArch64::SEH_SaveAnyRegQPX:
1077       return true;
1078   }
1079 }
1080 
1081 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
1082                                              Register &SrcReg, Register &DstReg,
1083                                              unsigned &SubIdx) const {
1084   switch (MI.getOpcode()) {
1085   default:
1086     return false;
1087   case AArch64::SBFMXri: // aka sxtw
1088   case AArch64::UBFMXri: // aka uxtw
1089     // Check for the 32 -> 64 bit extension case, these instructions can do
1090     // much more.
1091     if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
1092       return false;
1093     // This is a signed or unsigned 32 -> 64 bit extension.
1094     SrcReg = MI.getOperand(1).getReg();
1095     DstReg = MI.getOperand(0).getReg();
1096     SubIdx = AArch64::sub_32;
1097     return true;
1098   }
1099 }
1100 
1101 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
1102     const MachineInstr &MIa, const MachineInstr &MIb) const {
1103   const TargetRegisterInfo *TRI = &getRegisterInfo();
1104   const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1105   int64_t OffsetA = 0, OffsetB = 0;
1106   TypeSize WidthA(0, false), WidthB(0, false);
1107   bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1108 
1109   assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1110   assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1111 
1112   if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
1113       MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
1114     return false;
1115 
1116   // Retrieve the base, the offset from the base, and the width. The width is
1117   // the size of memory being loaded/stored (e.g. 1, 2, 4, 8). If the bases
1118   // are identical, and the offset of the lower memory access plus its width
1119   // does not overlap the offset of the higher memory access, then the memory
1120   // accesses do not alias.
1121   // If OffsetAIsScalable and OffsetBIsScalable are both true, they
1122   // are assumed to have the same scale (vscale).
1123   if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
1124                                    WidthA, TRI) &&
1125       getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
1126                                    WidthB, TRI)) {
1127     if (BaseOpA->isIdenticalTo(*BaseOpB) &&
1128         OffsetAIsScalable == OffsetBIsScalable) {
1129       int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1130       int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1131       TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1132       if (LowWidth.isScalable() == OffsetAIsScalable &&
1133           LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
1134         return true;
1135     }
1136   }
1137   return false;
1138 }
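
// For example, "str x1, [x20]" and "str x2, [x20, #8]" share the base x20 and
// the lower access (offset 0, width 8) ends exactly at the higher access's
// offset, so the two stores are reported as trivially disjoint.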
1139 
1140 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
1141                                             const MachineBasicBlock *MBB,
1142                                             const MachineFunction &MF) const {
1143   if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
1144     return true;
1145 
1146   // Do not move an instruction that can be recognized as a branch target.
1147   if (hasBTISemantics(MI))
1148     return true;
1149 
1150   switch (MI.getOpcode()) {
1151   case AArch64::HINT:
1152     // CSDB hints are scheduling barriers.
1153     if (MI.getOperand(0).getImm() == 0x14)
1154       return true;
1155     break;
1156   case AArch64::DSB:
1157   case AArch64::ISB:
1158     // DSB and ISB also are scheduling barriers.
1159     return true;
1160   case AArch64::MSRpstatesvcrImm1:
1161     // SMSTART and SMSTOP are also scheduling barriers.
1162     return true;
1163   default:;
1164   }
1165   if (isSEHInstruction(MI))
1166     return true;
1167   auto Next = std::next(MI.getIterator());
1168   return Next != MBB->end() && Next->isCFIInstruction();
1169 }
1170 
1171 /// analyzeCompare - For a comparison instruction, return the source registers
1172 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1173 /// Return true if the comparison instruction can be analyzed.
1174 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
1175                                       Register &SrcReg2, int64_t &CmpMask,
1176                                       int64_t &CmpValue) const {
1177   // The first operand can be a frame index where we'd normally expect a
1178   // register.
1179   assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1180   if (!MI.getOperand(1).isReg())
1181     return false;
1182 
1183   switch (MI.getOpcode()) {
1184   default:
1185     break;
1186   case AArch64::PTEST_PP:
1187   case AArch64::PTEST_PP_ANY:
1188     SrcReg = MI.getOperand(0).getReg();
1189     SrcReg2 = MI.getOperand(1).getReg();
1190     // Not sure about the mask and value for now...
1191     CmpMask = ~0;
1192     CmpValue = 0;
1193     return true;
1194   case AArch64::SUBSWrr:
1195   case AArch64::SUBSWrs:
1196   case AArch64::SUBSWrx:
1197   case AArch64::SUBSXrr:
1198   case AArch64::SUBSXrs:
1199   case AArch64::SUBSXrx:
1200   case AArch64::ADDSWrr:
1201   case AArch64::ADDSWrs:
1202   case AArch64::ADDSWrx:
1203   case AArch64::ADDSXrr:
1204   case AArch64::ADDSXrs:
1205   case AArch64::ADDSXrx:
1206     // Replace SUBSWrr with SUBWrr if NZCV is not used.
1207     SrcReg = MI.getOperand(1).getReg();
1208     SrcReg2 = MI.getOperand(2).getReg();
1209     CmpMask = ~0;
1210     CmpValue = 0;
1211     return true;
1212   case AArch64::SUBSWri:
1213   case AArch64::ADDSWri:
1214   case AArch64::SUBSXri:
1215   case AArch64::ADDSXri:
1216     SrcReg = MI.getOperand(1).getReg();
1217     SrcReg2 = 0;
1218     CmpMask = ~0;
1219     CmpValue = MI.getOperand(2).getImm();
1220     return true;
1221   case AArch64::ANDSWri:
1222   case AArch64::ANDSXri:
1223     // ANDS does not use the same encoding scheme as the other xxxS
1224     // instructions.
1225     SrcReg = MI.getOperand(1).getReg();
1226     SrcReg2 = 0;
1227     CmpMask = ~0;
1228     CmpValue = AArch64_AM::decodeLogicalImmediate(
1229                    MI.getOperand(2).getImm(),
1230                    MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
1231     return true;
1232   }
1233 
1234   return false;
1235 }
1236 
1237 static bool UpdateOperandRegClass(MachineInstr &Instr) {
1238   MachineBasicBlock *MBB = Instr.getParent();
1239   assert(MBB && "Can't get MachineBasicBlock here");
1240   MachineFunction *MF = MBB->getParent();
1241   assert(MF && "Can't get MachineFunction here");
1242   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1243   const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1244   MachineRegisterInfo *MRI = &MF->getRegInfo();
1245 
1246   for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1247        ++OpIdx) {
1248     MachineOperand &MO = Instr.getOperand(OpIdx);
1249     const TargetRegisterClass *OpRegCstraints =
1250         Instr.getRegClassConstraint(OpIdx, TII, TRI);
1251 
1252     // If there's no constraint, there's nothing to do.
1253     if (!OpRegCstraints)
1254       continue;
1255     // If the operand is a frame index, there's nothing to do here.
1256     // A frame index operand will resolve correctly during PEI.
1257     if (MO.isFI())
1258       continue;
1259 
1260     assert(MO.isReg() &&
1261            "Operand has register constraints without being a register!");
1262 
1263     Register Reg = MO.getReg();
1264     if (Reg.isPhysical()) {
1265       if (!OpRegCstraints->contains(Reg))
1266         return false;
1267     } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1268                !MRI->constrainRegClass(Reg, OpRegCstraints))
1269       return false;
1270   }
1271 
1272   return true;
1273 }
1274 
1275 /// Return the opcode that does not set flags when possible - otherwise
1276 /// return the original opcode. The caller is responsible for doing the actual
1277 /// substitution and legality checking.
1278 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
1279   // Don't convert all compare instructions, because for some the zero register
1280   // encoding becomes the sp register.
1281   bool MIDefinesZeroReg = false;
1282   if (MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1283       MI.definesRegister(AArch64::XZR, /*TRI=*/nullptr))
1284     MIDefinesZeroReg = true;
1285 
1286   switch (MI.getOpcode()) {
1287   default:
1288     return MI.getOpcode();
1289   case AArch64::ADDSWrr:
1290     return AArch64::ADDWrr;
1291   case AArch64::ADDSWri:
1292     return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1293   case AArch64::ADDSWrs:
1294     return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1295   case AArch64::ADDSWrx:
1296     return AArch64::ADDWrx;
1297   case AArch64::ADDSXrr:
1298     return AArch64::ADDXrr;
1299   case AArch64::ADDSXri:
1300     return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1301   case AArch64::ADDSXrs:
1302     return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1303   case AArch64::ADDSXrx:
1304     return AArch64::ADDXrx;
1305   case AArch64::SUBSWrr:
1306     return AArch64::SUBWrr;
1307   case AArch64::SUBSWri:
1308     return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1309   case AArch64::SUBSWrs:
1310     return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1311   case AArch64::SUBSWrx:
1312     return AArch64::SUBWrx;
1313   case AArch64::SUBSXrr:
1314     return AArch64::SUBXrr;
1315   case AArch64::SUBSXri:
1316     return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1317   case AArch64::SUBSXrs:
1318     return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1319   case AArch64::SUBSXrx:
1320     return AArch64::SUBXrx;
1321   }
1322 }
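
// For example, a compare such as "subs wzr, w1, #4" (SUBSWri defining WZR)
// keeps its flag-setting opcode above: in the non-flag-setting SUBWri encoding,
// register 31 in the destination slot means WSP rather than WZR, so dropping
// the 'S' would change the instruction's meaning.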
1323 
1324 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1325 
1326 /// True when condition flags are accessed (either by writing or reading)
1327 /// on the instruction trace starting at From and ending at To.
1328 ///
1329 /// Note: If From and To are in different blocks, the condition flags are
1330 ///       assumed to be accessed somewhere on the path between them.
1331 static bool areCFlagsAccessedBetweenInstrs(
1332     MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1333     const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1334   // Early exit if To is at the beginning of the BB.
1335   if (To == To->getParent()->begin())
1336     return true;
1337 
1338   // Check whether the instructions are in the same basic block
1339   // If not, assume the condition flags might get modified somewhere.
1340   if (To->getParent() != From->getParent())
1341     return true;
1342 
1343   // From must be above To.
1344   assert(std::any_of(
1345       ++To.getReverse(), To->getParent()->rend(),
1346       [From](MachineInstr &MI) { return MI.getIterator() == From; }));
1347 
1348   // We iterate backward starting at \p To until we hit \p From.
1349   for (const MachineInstr &Instr :
1350        instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
1351     if (((AccessToCheck & AK_Write) &&
1352          Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1353         ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1354       return true;
1355   }
1356   return false;
1357 }
1358 
1359 std::optional<unsigned>
1360 AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
1361                                       MachineInstr *Pred,
1362                                       const MachineRegisterInfo *MRI) const {
1363   unsigned MaskOpcode = Mask->getOpcode();
1364   unsigned PredOpcode = Pred->getOpcode();
1365   bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
1366   bool PredIsWhileLike = isWhileOpcode(PredOpcode);
1367 
1368   if (PredIsWhileLike) {
1369     // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
1370     // instruction and the condition is "any" since WHILEcc does an implicit
1371     // PTEST(ALL, PG) check and PG is always a subset of ALL.
1372     if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1373       return PredOpcode;
1374 
1375     // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
1376     // redundant since WHILE performs an implicit PTEST with an all active
1377     // mask.
1378     if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1379         getElementSizeForOpcode(MaskOpcode) ==
1380             getElementSizeForOpcode(PredOpcode))
1381       return PredOpcode;
1382 
1383     return {};
1384   }
1385 
1386   if (PredIsPTestLike) {
1387     // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1388     // instruction that sets the flags as PTEST would and the condition is
1389     // "any" since PG is always a subset of the governing predicate of the
1390     // ptest-like instruction.
1391     if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1392       return PredOpcode;
1393 
1394     // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
1395     // element size matches and either the PTEST_LIKE instruction uses
1396     // the same all active mask or the condition is "any".
1397     if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1398         getElementSizeForOpcode(MaskOpcode) ==
1399             getElementSizeForOpcode(PredOpcode)) {
1400       auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1401       if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1402         return PredOpcode;
1403     }
1404 
1405     // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
1406     // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
1407     // on 8-bit predicates like the PTEST.  Otherwise, for instructions like
1408     // compare that also support 16/32/64-bit predicates, the implicit PTEST
1409     // performed by the compare could consider fewer lanes for these element
1410     // sizes.
1411     //
1412     // For example, consider
1413     //
1414     //   ptrue p0.b                    ; P0=1111-1111-1111-1111
1415     //   index z0.s, #0, #1            ; Z0=<0,1,2,3>
1416     //   index z1.s, #1, #1            ; Z1=<1,2,3,4>
1417     //   cmphi p1.s, p0/z, z1.s, z0.s  ; P1=0001-0001-0001-0001
1418     //                                 ;       ^ last active
1419     //   ptest p0, p1.b                ; P1=0001-0001-0001-0001
1420     //                                 ;     ^ last active
1421     //
1422     // where the compare generates a canonical all active 32-bit predicate
1423     // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
1424     // active flag, whereas the PTEST instruction with the same mask doesn't.
1425     // For PTEST_ANY this doesn't apply as the flags in this case would be
1426     // identical regardless of element size.
1427     auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1428     uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
1429     if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
1430                                   PTest->getOpcode() == AArch64::PTEST_PP_ANY))
1431       return PredOpcode;
1432 
1433     return {};
1434   }
1435 
1436   // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
1437   // opcode so the PTEST becomes redundant.
1438   switch (PredOpcode) {
1439   case AArch64::AND_PPzPP:
1440   case AArch64::BIC_PPzPP:
1441   case AArch64::EOR_PPzPP:
1442   case AArch64::NAND_PPzPP:
1443   case AArch64::NOR_PPzPP:
1444   case AArch64::ORN_PPzPP:
1445   case AArch64::ORR_PPzPP:
1446   case AArch64::BRKA_PPzP:
1447   case AArch64::BRKPA_PPzPP:
1448   case AArch64::BRKB_PPzP:
1449   case AArch64::BRKPB_PPzPP:
1450   case AArch64::RDFFR_PPz: {
1451     // Check to see if our mask is the same. If not, the resulting flag bits
1452     // may be different and we can't remove the ptest.
1453     auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1454     if (Mask != PredMask)
1455       return {};
1456     break;
1457   }
1458   case AArch64::BRKN_PPzP: {
1459     // BRKN uses an all active implicit mask to set flags unlike the other
1460     // flag-setting instructions.
1461     // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
1462     if ((MaskOpcode != AArch64::PTRUE_B) ||
1463         (Mask->getOperand(1).getImm() != 31))
1464       return {};
1465     break;
1466   }
1467   case AArch64::PTRUE_B:
1468     // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
1469     break;
1470   default:
1471     // Bail out if we don't recognize the input
1472     return {};
1473   }
1474 
1475   return convertToFlagSettingOpc(PredOpcode);
1476 }
1477 
1478 /// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1479 /// operation which could set the flags in an identical manner
1480 bool AArch64InstrInfo::optimizePTestInstr(
1481     MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1482     const MachineRegisterInfo *MRI) const {
1483   auto *Mask = MRI->getUniqueVRegDef(MaskReg);
1484   auto *Pred = MRI->getUniqueVRegDef(PredReg);
1485   unsigned PredOpcode = Pred->getOpcode();
1486   auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
1487   if (!NewOp)
1488     return false;
1489 
1490   const TargetRegisterInfo *TRI = &getRegisterInfo();
1491 
1492   // If another instruction between Pred and PTest accesses flags, don't remove
1493   // the ptest or update the earlier instruction to modify them.
1494   if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
1495     return false;
1496 
1497   // If we pass all the checks, it's safe to remove the PTEST and use the flags
1498   // as they are prior to PTEST. Sometimes this requires the tested PTEST
1499   // operand to be replaced with an equivalent instruction that also sets the
1500   // flags.
1501   PTest->eraseFromParent();
1502   if (*NewOp != PredOpcode) {
1503     Pred->setDesc(get(*NewOp));
1504     bool succeeded = UpdateOperandRegClass(*Pred);
1505     (void)succeeded;
1506     assert(succeeded && "Operands have incompatible register classes!");
1507     Pred->addRegisterDefined(AArch64::NZCV, TRI);
1508   }
1509 
1510   // Ensure that the flags def is live.
1511   if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1512     unsigned i = 0, e = Pred->getNumOperands();
1513     for (; i != e; ++i) {
1514       MachineOperand &MO = Pred->getOperand(i);
1515       if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1516         MO.setIsDead(false);
1517         break;
1518       }
1519     }
1520   }
1521   return true;
1522 }
1523 
1524 /// Try to optimize a compare instruction. A compare instruction is an
1525 /// instruction which produces AArch64::NZCV. It is truly a compare
1526 /// instruction only when there are no uses of its destination register.
1528 ///
1529 /// The following steps are tried in order:
1530 /// 1. Convert CmpInstr into an unconditional version.
1531 /// 2. Remove CmpInstr if there is an earlier instruction producing a needed
1532 ///    condition code, or an instruction which can be converted into such an
1533 ///    instruction.
1534 ///    Only comparisons with zero or one are supported.
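///
/// A schematic example of step 1 (illustrative MIR, operand details
/// simplified): when NZCV produced by the compare is dead,
/// \code
///   %2:gpr32 = SUBSWri %1, 5, 0, implicit-def dead $nzcv
/// \endcode
/// can be rewritten as
/// \code
///   %2:gpr32 = SUBWri %1, 5, 0
/// \endcode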
1535 bool AArch64InstrInfo::optimizeCompareInstr(
1536     MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
1537     int64_t CmpValue, const MachineRegisterInfo *MRI) const {
1538   assert(CmpInstr.getParent());
1539   assert(MRI);
1540 
1541   // Replace SUBSWrr with SUBWrr if NZCV is not used.
1542   int DeadNZCVIdx =
1543       CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
1544   if (DeadNZCVIdx != -1) {
1545     if (CmpInstr.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1546         CmpInstr.definesRegister(AArch64::XZR, /*TRI=*/nullptr)) {
1547       CmpInstr.eraseFromParent();
1548       return true;
1549     }
1550     unsigned Opc = CmpInstr.getOpcode();
1551     unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1552     if (NewOpc == Opc)
1553       return false;
1554     const MCInstrDesc &MCID = get(NewOpc);
1555     CmpInstr.setDesc(MCID);
1556     CmpInstr.removeOperand(DeadNZCVIdx);
1557     bool succeeded = UpdateOperandRegClass(CmpInstr);
1558     (void)succeeded;
1559     assert(succeeded && "Some operands reg class are incompatible!");
1560     return true;
1561   }
1562 
1563   if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
1564       CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY)
1565     return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
1566 
1567   if (SrcReg2 != 0)
1568     return false;
1569 
1570   // CmpInstr is a Compare instruction if its destination register is not used.
1571   if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1572     return false;
1573 
1574   if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
1575     return true;
1576   return (CmpValue == 0 || CmpValue == 1) &&
1577          removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
1578 }
1579 
1580 /// Get opcode of S version of Instr.
1581 /// If Instr is S version its opcode is returned.
1582 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
1583 /// version or we are not interested in it.
1584 static unsigned sForm(MachineInstr &Instr) {
1585   switch (Instr.getOpcode()) {
1586   default:
1587     return AArch64::INSTRUCTION_LIST_END;
1588 
1589   case AArch64::ADDSWrr:
1590   case AArch64::ADDSWri:
1591   case AArch64::ADDSXrr:
1592   case AArch64::ADDSXri:
1593   case AArch64::SUBSWrr:
1594   case AArch64::SUBSWri:
1595   case AArch64::SUBSXrr:
1596   case AArch64::SUBSXri:
1597     return Instr.getOpcode();
1598 
1599   case AArch64::ADDWrr:
1600     return AArch64::ADDSWrr;
1601   case AArch64::ADDWri:
1602     return AArch64::ADDSWri;
1603   case AArch64::ADDXrr:
1604     return AArch64::ADDSXrr;
1605   case AArch64::ADDXri:
1606     return AArch64::ADDSXri;
1607   case AArch64::ADCWr:
1608     return AArch64::ADCSWr;
1609   case AArch64::ADCXr:
1610     return AArch64::ADCSXr;
1611   case AArch64::SUBWrr:
1612     return AArch64::SUBSWrr;
1613   case AArch64::SUBWri:
1614     return AArch64::SUBSWri;
1615   case AArch64::SUBXrr:
1616     return AArch64::SUBSXrr;
1617   case AArch64::SUBXri:
1618     return AArch64::SUBSXri;
1619   case AArch64::SBCWr:
1620     return AArch64::SBCSWr;
1621   case AArch64::SBCXr:
1622     return AArch64::SBCSXr;
1623   case AArch64::ANDWri:
1624     return AArch64::ANDSWri;
1625   case AArch64::ANDXri:
1626     return AArch64::ANDSXri;
1627   }
1628 }
1629 
1630 /// Check if AArch64::NZCV should be alive in successors of MBB.
1631 static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
1632   for (auto *BB : MBB->successors())
1633     if (BB->isLiveIn(AArch64::NZCV))
1634       return true;
1635   return false;
1636 }
1637 
1638 /// \returns The condition code operand index for \p Instr if it is a branch
1639 /// or select and -1 otherwise.
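/// For example (illustrative): for Bcc the condition code is operand 0, while
/// for CSELWr and the other selects handled below it is operand 3.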
1640 static int
1641 findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
1642   switch (Instr.getOpcode()) {
1643   default:
1644     return -1;
1645 
1646   case AArch64::Bcc: {
1647     int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
1648     assert(Idx >= 2);
1649     return Idx - 2;
1650   }
1651 
1652   case AArch64::CSINVWr:
1653   case AArch64::CSINVXr:
1654   case AArch64::CSINCWr:
1655   case AArch64::CSINCXr:
1656   case AArch64::CSELWr:
1657   case AArch64::CSELXr:
1658   case AArch64::CSNEGWr:
1659   case AArch64::CSNEGXr:
1660   case AArch64::FCSELSrrr:
1661   case AArch64::FCSELDrrr: {
1662     int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
1663     assert(Idx >= 1);
1664     return Idx - 1;
1665   }
1666   }
1667 }
1668 
1669 /// Find a condition code used by the instruction.
1670 /// Returns AArch64CC::Invalid if either the instruction does not use condition
1671 /// codes or we don't optimize CmpInstr in the presence of such instructions.
1672 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1673   int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
1674   return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
1675                           Instr.getOperand(CCIdx).getImm())
1676                     : AArch64CC::Invalid;
1677 }
1678 
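// For example (illustrative): getUsedNZCV(AArch64CC::GT) reports Z, N and V as
// used, while getUsedNZCV(AArch64CC::EQ) reports only Z.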
1679 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1680   assert(CC != AArch64CC::Invalid);
1681   UsedNZCV UsedFlags;
1682   switch (CC) {
1683   default:
1684     break;
1685 
1686   case AArch64CC::EQ: // Z set
1687   case AArch64CC::NE: // Z clear
1688     UsedFlags.Z = true;
1689     break;
1690 
1691   case AArch64CC::HI: // Z clear and C set
1692   case AArch64CC::LS: // Z set   or  C clear
1693     UsedFlags.Z = true;
1694     [[fallthrough]];
1695   case AArch64CC::HS: // C set
1696   case AArch64CC::LO: // C clear
1697     UsedFlags.C = true;
1698     break;
1699 
1700   case AArch64CC::MI: // N set
1701   case AArch64CC::PL: // N clear
1702     UsedFlags.N = true;
1703     break;
1704 
1705   case AArch64CC::VS: // V set
1706   case AArch64CC::VC: // V clear
1707     UsedFlags.V = true;
1708     break;
1709 
1710   case AArch64CC::GT: // Z clear, N and V the same
1711   case AArch64CC::LE: // Z set,   N and V differ
1712     UsedFlags.Z = true;
1713     [[fallthrough]];
1714   case AArch64CC::GE: // N and V the same
1715   case AArch64CC::LT: // N and V differ
1716     UsedFlags.N = true;
1717     UsedFlags.V = true;
1718     break;
1719   }
1720   return UsedFlags;
1721 }
1722 
1723 /// \returns Condition flags used after \p CmpInstr in its MachineBB if NZCV
1724 /// flags are not alive in successors of the block containing both \p CmpInstr
1725 /// and \p MI.
1725 /// \returns std::nullopt otherwise.
1726 ///
1727 /// Collects the instructions using those flags in \p CCUseInstrs if provided.
1728 std::optional<UsedNZCV>
1729 llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
1730                        const TargetRegisterInfo &TRI,
1731                        SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
1732   MachineBasicBlock *CmpParent = CmpInstr.getParent();
1733   if (MI.getParent() != CmpParent)
1734     return std::nullopt;
1735 
1736   if (areCFlagsAliveInSuccessors(CmpParent))
1737     return std::nullopt;
1738 
1739   UsedNZCV NZCVUsedAfterCmp;
1740   for (MachineInstr &Instr : instructionsWithoutDebug(
1741            std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
1742     if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
1743       AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1744       if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1745         return std::nullopt;
1746       NZCVUsedAfterCmp |= getUsedNZCV(CC);
1747       if (CCUseInstrs)
1748         CCUseInstrs->push_back(&Instr);
1749     }
1750     if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
1751       break;
1752   }
1753   return NZCVUsedAfterCmp;
1754 }
1755 
1756 static bool isADDSRegImm(unsigned Opcode) {
1757   return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1758 }
1759 
1760 static bool isSUBSRegImm(unsigned Opcode) {
1761   return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1762 }
1763 
1764 /// Check if CmpInstr can be substituted by MI.
1765 ///
1766 /// CmpInstr can be substituted:
1767 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1768 /// - and, MI and CmpInstr are from the same MachineBB
1769 /// - and, condition flags are not alive in successors of the CmpInstr parent
1770 /// - and, if MI opcode is the S form there must be no defs of flags between
1771 ///        MI and CmpInstr
1772 ///        or if MI opcode is not the S form there must be neither defs of flags
1773 ///        nor uses of flags between MI and CmpInstr.
1774 /// - and, the C flag is not used after CmpInstr
1775 ///        and either the V flag is not used after CmpInstr or MI produces a
1776 ///        poison value when signed overflow occurs.
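///
/// A sketch of a sequence this enables (assembly shown for illustration only):
/// \code
///   sub  w1, w0, #7       ; MI, has the flag-setting form subs
///   cmp  w1, #0           ; CmpInstr, i.e. 'subs wzr, w1, #0'
///   b.ne .Lbb             ; only the Z flag is used afterwards
/// \endcode
/// Converting the sub to subs lets the cmp be removed.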
1777 static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
1778                                        const TargetRegisterInfo &TRI) {
1779   // NOTE this assertion guarantees that MI.getOpcode() is an add or subtract
1780   // that may or may not set flags.
1781   assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
1782 
1783   const unsigned CmpOpcode = CmpInstr.getOpcode();
1784   if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
1785     return false;
1786 
1787   assert((CmpInstr.getOperand(2).isImm() &&
1788           CmpInstr.getOperand(2).getImm() == 0) &&
1789          "Caller guarantees that CmpInstr compares with constant 0");
1790 
1791   std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
1792   if (!NZVCUsed || NZVCUsed->C)
1793     return false;
1794 
1795   // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
1796   // '%vreg = add ...' or '%vreg = sub ...'.
1797   // Condition flag V is used to indicate signed overflow.
1798   // 1) MI and CmpInstr set N and V to the same value.
1799   // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
1800   //    signed overflow occurs, so CmpInstr could still be simplified away.
1801   if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap))
1802     return false;
1803 
1804   AccessKind AccessToCheck = AK_Write;
1805   if (sForm(MI) != MI.getOpcode())
1806     AccessToCheck = AK_All;
1807   return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
1808 }
1809 
1810 /// Substitute an instruction comparing to zero with another instruction
1811 /// which produces needed condition flags.
1812 ///
1813 /// Return true on success.
1814 bool AArch64InstrInfo::substituteCmpToZero(
1815     MachineInstr &CmpInstr, unsigned SrcReg,
1816     const MachineRegisterInfo &MRI) const {
1817   // Get the unique definition of SrcReg.
1818   MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1819   if (!MI)
1820     return false;
1821 
1822   const TargetRegisterInfo &TRI = getRegisterInfo();
1823 
1824   unsigned NewOpc = sForm(*MI);
1825   if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1826     return false;
1827 
1828   if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
1829     return false;
1830 
1831   // Update the instruction to set NZCV.
1832   MI->setDesc(get(NewOpc));
1833   CmpInstr.eraseFromParent();
1834   bool succeeded = UpdateOperandRegClass(*MI);
1835   (void)succeeded;
1836   assert(succeeded && "Some operands reg class are incompatible!");
1837   MI->addRegisterDefined(AArch64::NZCV, &TRI);
1838   return true;
1839 }
1840 
1841 /// \returns True if \p CmpInstr can be removed.
1842 ///
1843 /// \p IsInvertCC is true if, after removing \p CmpInstr, condition
1844 /// codes used in \p CCUseInstrs must be inverted.
1845 static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
1846                                  int CmpValue, const TargetRegisterInfo &TRI,
1847                                  SmallVectorImpl<MachineInstr *> &CCUseInstrs,
1848                                  bool &IsInvertCC) {
1849   assert((CmpValue == 0 || CmpValue == 1) &&
1850          "Only comparisons to 0 or 1 considered for removal!");
1851 
1852   // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
1853   unsigned MIOpc = MI.getOpcode();
1854   if (MIOpc == AArch64::CSINCWr) {
1855     if (MI.getOperand(1).getReg() != AArch64::WZR ||
1856         MI.getOperand(2).getReg() != AArch64::WZR)
1857       return false;
1858   } else if (MIOpc == AArch64::CSINCXr) {
1859     if (MI.getOperand(1).getReg() != AArch64::XZR ||
1860         MI.getOperand(2).getReg() != AArch64::XZR)
1861       return false;
1862   } else {
1863     return false;
1864   }
1865   AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI);
1866   if (MICC == AArch64CC::Invalid)
1867     return false;
1868 
1869   // NZCV needs to be defined
1870   if (MI.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) != -1)
1871     return false;
1872 
1873   // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
1874   const unsigned CmpOpcode = CmpInstr.getOpcode();
1875   bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
1876   if (CmpValue && !IsSubsRegImm)
1877     return false;
1878   if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
1879     return false;
1880 
1881   // MI conditions allowed: eq, ne, mi, pl
1882   UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
1883   if (MIUsedNZCV.C || MIUsedNZCV.V)
1884     return false;
1885 
1886   std::optional<UsedNZCV> NZCVUsedAfterCmp =
1887       examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
1888   // Condition flags are not used in CmpInstr basic block successors and only
1889   // Z or N flags allowed to be used after CmpInstr within its basic block
1890   if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
1891     return false;
1892   // Z or N flag used after CmpInstr must correspond to the flag used in MI
1893   if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
1894       (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
1895     return false;
1896   // If CmpInstr is comparison to zero MI conditions are limited to eq, ne
1897   if (MIUsedNZCV.N && !CmpValue)
1898     return false;
1899 
1900   // There must be no defs of flags between MI and CmpInstr
1901   if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
1902     return false;
1903 
1904   // Condition code is inverted in the following cases:
1905   // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1906   // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
1907   IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
1908                (!CmpValue && MICC == AArch64CC::NE);
1909   return true;
1910 }
1911 
1912 /// Remove comparison in csinc-cmp sequence
1913 ///
1914 /// Examples:
1915 /// 1. \code
1916 ///   csinc w9, wzr, wzr, ne
1917 ///   cmp   w9, #0
1918 ///   b.eq
1919 ///    \endcode
1920 /// to
1921 ///    \code
1922 ///   csinc w9, wzr, wzr, ne
1923 ///   b.ne
1924 ///    \endcode
1925 ///
1926 /// 2. \code
1927 ///   csinc x2, xzr, xzr, mi
1928 ///   cmp   x2, #1
1929 ///   b.pl
1930 ///    \endcode
1931 /// to
1932 ///    \code
1933 ///   csinc x2, xzr, xzr, mi
1934 ///   b.pl
1935 ///    \endcode
1936 ///
1937 /// \param  CmpInstr comparison instruction
1938 /// \return True when comparison removed
1939 bool AArch64InstrInfo::removeCmpToZeroOrOne(
1940     MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
1941     const MachineRegisterInfo &MRI) const {
1942   MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1943   if (!MI)
1944     return false;
1945   const TargetRegisterInfo &TRI = getRegisterInfo();
1946   SmallVector<MachineInstr *, 4> CCUseInstrs;
1947   bool IsInvertCC = false;
1948   if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
1949                             IsInvertCC))
1950     return false;
1951   // Make transformation
1952   CmpInstr.eraseFromParent();
1953   if (IsInvertCC) {
1954     // Invert condition codes in CmpInstr CC users
1955     for (MachineInstr *CCUseInstr : CCUseInstrs) {
1956       int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
1957       assert(Idx >= 0 && "Unexpected instruction using CC.");
1958       MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
1959       AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
1960           static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
1961       CCOperand.setImm(CCUse);
1962     }
1963   }
1964   return true;
1965 }
1966 
1967 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1968   if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1969       MI.getOpcode() != AArch64::CATCHRET)
1970     return false;
1971 
1972   MachineBasicBlock &MBB = *MI.getParent();
1973   auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
1974   auto TRI = Subtarget.getRegisterInfo();
1975   DebugLoc DL = MI.getDebugLoc();
1976 
1977   if (MI.getOpcode() == AArch64::CATCHRET) {
1978     // Skip to the first instruction before the epilog.
1979     const TargetInstrInfo *TII =
1980       MBB.getParent()->getSubtarget().getInstrInfo();
1981     MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
1982     auto MBBI = MachineBasicBlock::iterator(MI);
1983     MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
1984     while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
1985            FirstEpilogSEH != MBB.begin())
1986       FirstEpilogSEH = std::prev(FirstEpilogSEH);
1987     if (FirstEpilogSEH != MBB.begin())
1988       FirstEpilogSEH = std::next(FirstEpilogSEH);
1989     BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
1990         .addReg(AArch64::X0, RegState::Define)
1991         .addMBB(TargetMBB);
1992     BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
1993         .addReg(AArch64::X0, RegState::Define)
1994         .addReg(AArch64::X0)
1995         .addMBB(TargetMBB)
1996         .addImm(0);
1997     return true;
1998   }
1999 
2000   Register Reg = MI.getOperand(0).getReg();
2001   Module &M = *MBB.getParent()->getFunction().getParent();
2002   if (M.getStackProtectorGuard() == "sysreg") {
2003     const AArch64SysReg::SysReg *SrcReg =
2004         AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
2005     if (!SrcReg)
2006       report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
2007 
2008     // mrs xN, sysreg
2009     BuildMI(MBB, MI, DL, get(AArch64::MRS))
2010         .addDef(Reg, RegState::Renamable)
2011         .addImm(SrcReg->Encoding);
2012     int Offset = M.getStackProtectorGuardOffset();
2013     if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
2014       // ldr xN, [xN, #offset]
2015       BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2016           .addDef(Reg)
2017           .addUse(Reg, RegState::Kill)
2018           .addImm(Offset / 8);
2019     } else if (Offset >= -256 && Offset <= 255) {
2020       // ldur xN, [xN, #offset]
2021       BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
2022           .addDef(Reg)
2023           .addUse(Reg, RegState::Kill)
2024           .addImm(Offset);
2025     } else if (Offset >= -4095 && Offset <= 4095) {
2026       if (Offset > 0) {
2027         // add xN, xN, #offset
2028         BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
2029             .addDef(Reg)
2030             .addUse(Reg, RegState::Kill)
2031             .addImm(Offset)
2032             .addImm(0);
2033       } else {
2034         // sub xN, xN, #offset
2035         BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
2036             .addDef(Reg)
2037             .addUse(Reg, RegState::Kill)
2038             .addImm(-Offset)
2039             .addImm(0);
2040       }
2041       // ldr xN, [xN]
2042       BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2043           .addDef(Reg)
2044           .addUse(Reg, RegState::Kill)
2045           .addImm(0);
2046     } else {
2047       // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
2048       // than 32760.
2049       // It might be nice to use AArch64::MOVi32imm here, which would get
2050       // expanded in PreSched2 after PostRA, but our lone scratch Reg already
2051       // contains the MRS result. findScratchNonCalleeSaveRegister() in
2052       // AArch64FrameLowering might help us find such a scratch register
2053       // though. If we failed to find a scratch register, we could emit a
2054       // stream of add instructions to build up the immediate. Or, we could try
2055       // to insert a AArch64::MOVi32imm before register allocation so that we
2056       // didn't need to scavenge for a scratch register.
2057       report_fatal_error("Unable to encode Stack Protector Guard Offset");
2058     }
2059     MBB.erase(MI);
2060     return true;
2061   }
2062 
2063   const GlobalValue *GV =
2064       cast<GlobalValue>((*MI.memoperands_begin())->getValue());
2065   const TargetMachine &TM = MBB.getParent()->getTarget();
2066   unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2067   const unsigned char MO_NC = AArch64II::MO_NC;
2068 
2069   if ((OpFlags & AArch64II::MO_GOT) != 0) {
2070     BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
2071         .addGlobalAddress(GV, 0, OpFlags);
2072     if (Subtarget.isTargetILP32()) {
2073       unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2074       BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2075           .addDef(Reg32, RegState::Dead)
2076           .addUse(Reg, RegState::Kill)
2077           .addImm(0)
2078           .addMemOperand(*MI.memoperands_begin())
2079           .addDef(Reg, RegState::Implicit);
2080     } else {
2081       BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2082           .addReg(Reg, RegState::Kill)
2083           .addImm(0)
2084           .addMemOperand(*MI.memoperands_begin());
2085     }
2086   } else if (TM.getCodeModel() == CodeModel::Large) {
2087     assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
2088     BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
2089         .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
2090         .addImm(0);
2091     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2092         .addReg(Reg, RegState::Kill)
2093         .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
2094         .addImm(16);
2095     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2096         .addReg(Reg, RegState::Kill)
2097         .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
2098         .addImm(32);
2099     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2100         .addReg(Reg, RegState::Kill)
2101         .addGlobalAddress(GV, 0, AArch64II::MO_G3)
2102         .addImm(48);
2103     BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2104         .addReg(Reg, RegState::Kill)
2105         .addImm(0)
2106         .addMemOperand(*MI.memoperands_begin());
2107   } else if (TM.getCodeModel() == CodeModel::Tiny) {
2108     BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
2109         .addGlobalAddress(GV, 0, OpFlags);
2110   } else {
2111     BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
2112         .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2113     unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2114     if (Subtarget.isTargetILP32()) {
2115       unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2116       BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2117           .addDef(Reg32, RegState::Dead)
2118           .addUse(Reg, RegState::Kill)
2119           .addGlobalAddress(GV, 0, LoFlags)
2120           .addMemOperand(*MI.memoperands_begin())
2121           .addDef(Reg, RegState::Implicit);
2122     } else {
2123       BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2124           .addReg(Reg, RegState::Kill)
2125           .addGlobalAddress(GV, 0, LoFlags)
2126           .addMemOperand(*MI.memoperands_begin());
2127     }
2128   }
2129 
2130   MBB.erase(MI);
2131 
2132   return true;
2133 }
2134 
2135 // Return true if this instruction simply sets its single destination register
2136 // to zero. This is equivalent to a register rename of the zero-register.
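// For example (illustrative): 'movz w0, #0', 'and w0, wzr, #1', or a COPY
// from WZR all qualify.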
2137 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
2138   switch (MI.getOpcode()) {
2139   default:
2140     break;
2141   case AArch64::MOVZWi:
2142   case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2143     if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2144       assert(MI.getDesc().getNumOperands() == 3 &&
2145              MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2146       return true;
2147     }
2148     break;
2149   case AArch64::ANDWri: // and Rd, Rzr, #imm
2150     return MI.getOperand(1).getReg() == AArch64::WZR;
2151   case AArch64::ANDXri:
2152     return MI.getOperand(1).getReg() == AArch64::XZR;
2153   case TargetOpcode::COPY:
2154     return MI.getOperand(1).getReg() == AArch64::WZR;
2155   }
2156   return false;
2157 }
2158 
2159 // Return true if this instruction simply renames a general register without
2160 // modifying bits.
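// For example (illustrative): 'orr x0, xzr, x1', 'add x0, x1, #0', or a plain
// COPY between general-purpose registers.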
2161 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
2162   switch (MI.getOpcode()) {
2163   default:
2164     break;
2165   case TargetOpcode::COPY: {
2166     // GPR32 copies will be lowered to ORRXrs
2167     Register DstReg = MI.getOperand(0).getReg();
2168     return (AArch64::GPR32RegClass.contains(DstReg) ||
2169             AArch64::GPR64RegClass.contains(DstReg));
2170   }
2171   case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2172     if (MI.getOperand(1).getReg() == AArch64::XZR) {
2173       assert(MI.getDesc().getNumOperands() == 4 &&
2174              MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2175       return true;
2176     }
2177     break;
2178   case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2179     if (MI.getOperand(2).getImm() == 0) {
2180       assert(MI.getDesc().getNumOperands() == 4 &&
2181              MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2182       return true;
2183     }
2184     break;
2185   }
2186   return false;
2187 }
2188 
2189 // Return true if this instruction simply renames a floating-point register
2190 // without modifying bits.
2191 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
2192   switch (MI.getOpcode()) {
2193   default:
2194     break;
2195   case TargetOpcode::COPY: {
2196     Register DstReg = MI.getOperand(0).getReg();
2197     return AArch64::FPR128RegClass.contains(DstReg);
2198   }
2199   case AArch64::ORRv16i8:
2200     if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2201       assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2202              "invalid ORRv16i8 operands");
2203       return true;
2204     }
2205     break;
2206   }
2207   return false;
2208 }
2209 
2210 Register AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
2211                                                int &FrameIndex) const {
2212   switch (MI.getOpcode()) {
2213   default:
2214     break;
2215   case AArch64::LDRWui:
2216   case AArch64::LDRXui:
2217   case AArch64::LDRBui:
2218   case AArch64::LDRHui:
2219   case AArch64::LDRSui:
2220   case AArch64::LDRDui:
2221   case AArch64::LDRQui:
2222   case AArch64::LDR_PXI:
2223     if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2224         MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2225       FrameIndex = MI.getOperand(1).getIndex();
2226       return MI.getOperand(0).getReg();
2227     }
2228     break;
2229   }
2230 
2231   return 0;
2232 }
2233 
2234 Register AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
2235                                               int &FrameIndex) const {
2236   switch (MI.getOpcode()) {
2237   default:
2238     break;
2239   case AArch64::STRWui:
2240   case AArch64::STRXui:
2241   case AArch64::STRBui:
2242   case AArch64::STRHui:
2243   case AArch64::STRSui:
2244   case AArch64::STRDui:
2245   case AArch64::STRQui:
2246   case AArch64::STR_PXI:
2247     if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2248         MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2249       FrameIndex = MI.getOperand(1).getIndex();
2250       return MI.getOperand(0).getReg();
2251     }
2252     break;
2253   }
2254   return 0;
2255 }
2256 
2257 /// Check all MachineMemOperands for a hint to suppress pairing.
2258 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
2259   return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2260     return MMO->getFlags() & MOSuppressPair;
2261   });
2262 }
2263 
2264 /// Set a flag on the first MachineMemOperand to suppress pairing.
2265 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
2266   if (MI.memoperands_empty())
2267     return;
2268   (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2269 }
2270 
2271 /// Check all MachineMemOperands for a hint that the load/store is strided.
2272 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
2273   return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2274     return MMO->getFlags() & MOStridedAccess;
2275   });
2276 }
2277 
2278 bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) {
2279   switch (Opc) {
2280   default:
2281     return false;
2282   case AArch64::STURSi:
2283   case AArch64::STRSpre:
2284   case AArch64::STURDi:
2285   case AArch64::STRDpre:
2286   case AArch64::STURQi:
2287   case AArch64::STRQpre:
2288   case AArch64::STURBBi:
2289   case AArch64::STURHHi:
2290   case AArch64::STURWi:
2291   case AArch64::STRWpre:
2292   case AArch64::STURXi:
2293   case AArch64::STRXpre:
2294   case AArch64::LDURSi:
2295   case AArch64::LDRSpre:
2296   case AArch64::LDURDi:
2297   case AArch64::LDRDpre:
2298   case AArch64::LDURQi:
2299   case AArch64::LDRQpre:
2300   case AArch64::LDURWi:
2301   case AArch64::LDRWpre:
2302   case AArch64::LDURXi:
2303   case AArch64::LDRXpre:
2304   case AArch64::LDRSWpre:
2305   case AArch64::LDURSWi:
2306   case AArch64::LDURHHi:
2307   case AArch64::LDURBBi:
2308   case AArch64::LDURSBWi:
2309   case AArch64::LDURSHWi:
2310     return true;
2311   }
2312 }
2313 
2314 std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2315   switch (Opc) {
2316   default: return {};
2317   case AArch64::PRFMui: return AArch64::PRFUMi;
2318   case AArch64::LDRXui: return AArch64::LDURXi;
2319   case AArch64::LDRWui: return AArch64::LDURWi;
2320   case AArch64::LDRBui: return AArch64::LDURBi;
2321   case AArch64::LDRHui: return AArch64::LDURHi;
2322   case AArch64::LDRSui: return AArch64::LDURSi;
2323   case AArch64::LDRDui: return AArch64::LDURDi;
2324   case AArch64::LDRQui: return AArch64::LDURQi;
2325   case AArch64::LDRBBui: return AArch64::LDURBBi;
2326   case AArch64::LDRHHui: return AArch64::LDURHHi;
2327   case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2328   case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2329   case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2330   case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2331   case AArch64::LDRSWui: return AArch64::LDURSWi;
2332   case AArch64::STRXui: return AArch64::STURXi;
2333   case AArch64::STRWui: return AArch64::STURWi;
2334   case AArch64::STRBui: return AArch64::STURBi;
2335   case AArch64::STRHui: return AArch64::STURHi;
2336   case AArch64::STRSui: return AArch64::STURSi;
2337   case AArch64::STRDui: return AArch64::STURDi;
2338   case AArch64::STRQui: return AArch64::STURQi;
2339   case AArch64::STRBBui: return AArch64::STURBBi;
2340   case AArch64::STRHHui: return AArch64::STURHHi;
2341   }
2342 }
2343 
2344 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
2345   switch (Opc) {
2346   default:
2347     return 2;
2348   case AArch64::LDPXi:
2349   case AArch64::LDPDi:
2350   case AArch64::STPXi:
2351   case AArch64::STPDi:
2352   case AArch64::LDNPXi:
2353   case AArch64::LDNPDi:
2354   case AArch64::STNPXi:
2355   case AArch64::STNPDi:
2356   case AArch64::LDPQi:
2357   case AArch64::STPQi:
2358   case AArch64::LDNPQi:
2359   case AArch64::STNPQi:
2360   case AArch64::LDPWi:
2361   case AArch64::LDPSi:
2362   case AArch64::STPWi:
2363   case AArch64::STPSi:
2364   case AArch64::LDNPWi:
2365   case AArch64::LDNPSi:
2366   case AArch64::STNPWi:
2367   case AArch64::STNPSi:
2368   case AArch64::LDG:
2369   case AArch64::STGPi:
2370 
2371   case AArch64::LD1B_IMM:
2372   case AArch64::LD1B_H_IMM:
2373   case AArch64::LD1B_S_IMM:
2374   case AArch64::LD1B_D_IMM:
2375   case AArch64::LD1SB_H_IMM:
2376   case AArch64::LD1SB_S_IMM:
2377   case AArch64::LD1SB_D_IMM:
2378   case AArch64::LD1H_IMM:
2379   case AArch64::LD1H_S_IMM:
2380   case AArch64::LD1H_D_IMM:
2381   case AArch64::LD1SH_S_IMM:
2382   case AArch64::LD1SH_D_IMM:
2383   case AArch64::LD1W_IMM:
2384   case AArch64::LD1W_D_IMM:
2385   case AArch64::LD1SW_D_IMM:
2386   case AArch64::LD1D_IMM:
2387 
2388   case AArch64::LD2B_IMM:
2389   case AArch64::LD2H_IMM:
2390   case AArch64::LD2W_IMM:
2391   case AArch64::LD2D_IMM:
2392   case AArch64::LD3B_IMM:
2393   case AArch64::LD3H_IMM:
2394   case AArch64::LD3W_IMM:
2395   case AArch64::LD3D_IMM:
2396   case AArch64::LD4B_IMM:
2397   case AArch64::LD4H_IMM:
2398   case AArch64::LD4W_IMM:
2399   case AArch64::LD4D_IMM:
2400 
2401   case AArch64::ST1B_IMM:
2402   case AArch64::ST1B_H_IMM:
2403   case AArch64::ST1B_S_IMM:
2404   case AArch64::ST1B_D_IMM:
2405   case AArch64::ST1H_IMM:
2406   case AArch64::ST1H_S_IMM:
2407   case AArch64::ST1H_D_IMM:
2408   case AArch64::ST1W_IMM:
2409   case AArch64::ST1W_D_IMM:
2410   case AArch64::ST1D_IMM:
2411 
2412   case AArch64::ST2B_IMM:
2413   case AArch64::ST2H_IMM:
2414   case AArch64::ST2W_IMM:
2415   case AArch64::ST2D_IMM:
2416   case AArch64::ST3B_IMM:
2417   case AArch64::ST3H_IMM:
2418   case AArch64::ST3W_IMM:
2419   case AArch64::ST3D_IMM:
2420   case AArch64::ST4B_IMM:
2421   case AArch64::ST4H_IMM:
2422   case AArch64::ST4W_IMM:
2423   case AArch64::ST4D_IMM:
2424 
2425   case AArch64::LD1RB_IMM:
2426   case AArch64::LD1RB_H_IMM:
2427   case AArch64::LD1RB_S_IMM:
2428   case AArch64::LD1RB_D_IMM:
2429   case AArch64::LD1RSB_H_IMM:
2430   case AArch64::LD1RSB_S_IMM:
2431   case AArch64::LD1RSB_D_IMM:
2432   case AArch64::LD1RH_IMM:
2433   case AArch64::LD1RH_S_IMM:
2434   case AArch64::LD1RH_D_IMM:
2435   case AArch64::LD1RSH_S_IMM:
2436   case AArch64::LD1RSH_D_IMM:
2437   case AArch64::LD1RW_IMM:
2438   case AArch64::LD1RW_D_IMM:
2439   case AArch64::LD1RSW_IMM:
2440   case AArch64::LD1RD_IMM:
2441 
2442   case AArch64::LDNT1B_ZRI:
2443   case AArch64::LDNT1H_ZRI:
2444   case AArch64::LDNT1W_ZRI:
2445   case AArch64::LDNT1D_ZRI:
2446   case AArch64::STNT1B_ZRI:
2447   case AArch64::STNT1H_ZRI:
2448   case AArch64::STNT1W_ZRI:
2449   case AArch64::STNT1D_ZRI:
2450 
2451   case AArch64::LDNF1B_IMM:
2452   case AArch64::LDNF1B_H_IMM:
2453   case AArch64::LDNF1B_S_IMM:
2454   case AArch64::LDNF1B_D_IMM:
2455   case AArch64::LDNF1SB_H_IMM:
2456   case AArch64::LDNF1SB_S_IMM:
2457   case AArch64::LDNF1SB_D_IMM:
2458   case AArch64::LDNF1H_IMM:
2459   case AArch64::LDNF1H_S_IMM:
2460   case AArch64::LDNF1H_D_IMM:
2461   case AArch64::LDNF1SH_S_IMM:
2462   case AArch64::LDNF1SH_D_IMM:
2463   case AArch64::LDNF1W_IMM:
2464   case AArch64::LDNF1W_D_IMM:
2465   case AArch64::LDNF1SW_D_IMM:
2466   case AArch64::LDNF1D_IMM:
2467     return 3;
2468   case AArch64::ADDG:
2469   case AArch64::STGi:
2470   case AArch64::LDR_PXI:
2471   case AArch64::STR_PXI:
2472     return 2;
2473   }
2474 }
2475 
2476 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
2477   switch (MI.getOpcode()) {
2478   default:
2479     return false;
2480   // Scaled instructions.
2481   case AArch64::STRSui:
2482   case AArch64::STRDui:
2483   case AArch64::STRQui:
2484   case AArch64::STRXui:
2485   case AArch64::STRWui:
2486   case AArch64::LDRSui:
2487   case AArch64::LDRDui:
2488   case AArch64::LDRQui:
2489   case AArch64::LDRXui:
2490   case AArch64::LDRWui:
2491   case AArch64::LDRSWui:
2492   // Unscaled instructions.
2493   case AArch64::STURSi:
2494   case AArch64::STRSpre:
2495   case AArch64::STURDi:
2496   case AArch64::STRDpre:
2497   case AArch64::STURQi:
2498   case AArch64::STRQpre:
2499   case AArch64::STURWi:
2500   case AArch64::STRWpre:
2501   case AArch64::STURXi:
2502   case AArch64::STRXpre:
2503   case AArch64::LDURSi:
2504   case AArch64::LDRSpre:
2505   case AArch64::LDURDi:
2506   case AArch64::LDRDpre:
2507   case AArch64::LDURQi:
2508   case AArch64::LDRQpre:
2509   case AArch64::LDURWi:
2510   case AArch64::LDRWpre:
2511   case AArch64::LDURXi:
2512   case AArch64::LDRXpre:
2513   case AArch64::LDURSWi:
2514   case AArch64::LDRSWpre:
2515     return true;
2516   }
2517 }
2518 
2519 bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) {
2520   switch (MI.getOpcode()) {
2521   default:
2522     assert((!MI.isCall() || !MI.isReturn()) &&
2523            "Unexpected instruction - was a new tail call opcode introduced?");
2524     return false;
2525   case AArch64::TCRETURNdi:
2526   case AArch64::TCRETURNri:
2527   case AArch64::TCRETURNrix16x17:
2528   case AArch64::TCRETURNrix17:
2529   case AArch64::TCRETURNrinotx16:
2530   case AArch64::TCRETURNriALL:
2531   case AArch64::AUTH_TCRETURN:
2532   case AArch64::AUTH_TCRETURN_BTI:
2533     return true;
2534   }
2535 }
2536 
2537 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) {
2538   switch (Opc) {
2539   default:
2540     llvm_unreachable("Opcode has no flag setting equivalent!");
2541   // 32-bit cases:
2542   case AArch64::ADDWri:
2543     return AArch64::ADDSWri;
2544   case AArch64::ADDWrr:
2545     return AArch64::ADDSWrr;
2546   case AArch64::ADDWrs:
2547     return AArch64::ADDSWrs;
2548   case AArch64::ADDWrx:
2549     return AArch64::ADDSWrx;
2550   case AArch64::ANDWri:
2551     return AArch64::ANDSWri;
2552   case AArch64::ANDWrr:
2553     return AArch64::ANDSWrr;
2554   case AArch64::ANDWrs:
2555     return AArch64::ANDSWrs;
2556   case AArch64::BICWrr:
2557     return AArch64::BICSWrr;
2558   case AArch64::BICWrs:
2559     return AArch64::BICSWrs;
2560   case AArch64::SUBWri:
2561     return AArch64::SUBSWri;
2562   case AArch64::SUBWrr:
2563     return AArch64::SUBSWrr;
2564   case AArch64::SUBWrs:
2565     return AArch64::SUBSWrs;
2566   case AArch64::SUBWrx:
2567     return AArch64::SUBSWrx;
2568   // 64-bit cases:
2569   case AArch64::ADDXri:
2570     return AArch64::ADDSXri;
2571   case AArch64::ADDXrr:
2572     return AArch64::ADDSXrr;
2573   case AArch64::ADDXrs:
2574     return AArch64::ADDSXrs;
2575   case AArch64::ADDXrx:
2576     return AArch64::ADDSXrx;
2577   case AArch64::ANDXri:
2578     return AArch64::ANDSXri;
2579   case AArch64::ANDXrr:
2580     return AArch64::ANDSXrr;
2581   case AArch64::ANDXrs:
2582     return AArch64::ANDSXrs;
2583   case AArch64::BICXrr:
2584     return AArch64::BICSXrr;
2585   case AArch64::BICXrs:
2586     return AArch64::BICSXrs;
2587   case AArch64::SUBXri:
2588     return AArch64::SUBSXri;
2589   case AArch64::SUBXrr:
2590     return AArch64::SUBSXrr;
2591   case AArch64::SUBXrs:
2592     return AArch64::SUBSXrs;
2593   case AArch64::SUBXrx:
2594     return AArch64::SUBSXrx;
2595   // SVE instructions:
2596   case AArch64::AND_PPzPP:
2597     return AArch64::ANDS_PPzPP;
2598   case AArch64::BIC_PPzPP:
2599     return AArch64::BICS_PPzPP;
2600   case AArch64::EOR_PPzPP:
2601     return AArch64::EORS_PPzPP;
2602   case AArch64::NAND_PPzPP:
2603     return AArch64::NANDS_PPzPP;
2604   case AArch64::NOR_PPzPP:
2605     return AArch64::NORS_PPzPP;
2606   case AArch64::ORN_PPzPP:
2607     return AArch64::ORNS_PPzPP;
2608   case AArch64::ORR_PPzPP:
2609     return AArch64::ORRS_PPzPP;
2610   case AArch64::BRKA_PPzP:
2611     return AArch64::BRKAS_PPzP;
2612   case AArch64::BRKPA_PPzPP:
2613     return AArch64::BRKPAS_PPzPP;
2614   case AArch64::BRKB_PPzP:
2615     return AArch64::BRKBS_PPzP;
2616   case AArch64::BRKPB_PPzPP:
2617     return AArch64::BRKPBS_PPzPP;
2618   case AArch64::BRKN_PPzP:
2619     return AArch64::BRKNS_PPzP;
2620   case AArch64::RDFFR_PPz:
2621     return AArch64::RDFFRS_PPz;
2622   case AArch64::PTRUE_B:
2623     return AArch64::PTRUES_B;
2624   }
2625 }
2626 
2627 // Is this a candidate for ld/st merging or pairing?  For example, we don't
2628 // touch volatiles or load/stores that have a hint to avoid pair formation.
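// For example (illustrative), two adjacent loads such as
//   ldr x0, [sp, #8]
//   ldr x1, [sp, #16]
// may later be combined into 'ldp x0, x1, [sp, #8]' if this returns true.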
2629 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
2630 
2631   bool IsPreLdSt = isPreLdSt(MI);
2632 
2633   // If this is a volatile load/store, don't mess with it.
2634   if (MI.hasOrderedMemoryRef())
2635     return false;
2636 
2637   // Make sure this is a reg/fi+imm (as opposed to an address reloc).
2638   // For Pre-inc LD/ST, the operand is shifted by one.
2639   assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
2640           MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
2641          "Expected a reg or frame index operand.");
2642 
2643   // For Pre-indexed addressing quadword instructions, the third operand is the
2644   // immediate value.
2645   bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
2646 
2647   if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
2648     return false;
2649 
2650   // Can't merge/pair if the instruction modifies the base register.
2651   // e.g., ldr x0, [x0]
2652   // This case will never occur with an FI base.
2653   // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
2654   // STR<S,D,Q,W,X>pre, it can be merged.
2655   // For example:
2656   //   ldr q0, [x11, #32]!
2657   //   ldr q1, [x11, #16]
2658   //   to
2659   //   ldp q0, q1, [x11, #32]!
2660   if (MI.getOperand(1).isReg() && !IsPreLdSt) {
2661     Register BaseReg = MI.getOperand(1).getReg();
2662     const TargetRegisterInfo *TRI = &getRegisterInfo();
2663     if (MI.modifiesRegister(BaseReg, TRI))
2664       return false;
2665   }
2666 
2667   // Check if this load/store has a hint to avoid pair formation.
2668   // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
2669   if (isLdStPairSuppressed(MI))
2670     return false;
2671 
2672   // Do not pair any callee-save store/reload instructions in the
2673   // prologue/epilogue if the CFI information encoded the operations as separate
2674   // instructions, as that will cause the size of the actual prologue to mismatch
2675   // with the prologue size recorded in the Windows CFI.
2676   const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
2677   bool NeedsWinCFI = MAI->usesWindowsCFI() &&
2678                      MI.getMF()->getFunction().needsUnwindTableEntry();
2679   if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
2680                       MI.getFlag(MachineInstr::FrameDestroy)))
2681     return false;
2682 
2683   // On some CPUs quad load/store pairs are slower than two single load/stores.
2684   if (Subtarget.isPaired128Slow()) {
2685     switch (MI.getOpcode()) {
2686     default:
2687       break;
2688     case AArch64::LDURQi:
2689     case AArch64::STURQi:
2690     case AArch64::LDRQui:
2691     case AArch64::STRQui:
2692       return false;
2693     }
2694   }
2695 
2696   return true;
2697 }
2698 
2699 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
2700     const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
2701     int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
2702     const TargetRegisterInfo *TRI) const {
2703   if (!LdSt.mayLoadOrStore())
2704     return false;
2705 
2706   const MachineOperand *BaseOp;
2707   TypeSize WidthN(0, false);
2708   if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
2709                                     WidthN, TRI))
2710     return false;
2711   // The maximum vscale is 16 under AArch64; return the maximal extent for the
2712   // vector.
2713   Width = LocationSize::precise(WidthN);
2714   BaseOps.push_back(BaseOp);
2715   return true;
2716 }
2717 
2718 std::optional<ExtAddrMode>
2719 AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
2720                                           const TargetRegisterInfo *TRI) const {
2721   const MachineOperand *Base; // Filled with the base operand of MI.
2722   int64_t Offset;             // Filled with the offset of MI.
2723   bool OffsetIsScalable;
2724   if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
2725     return std::nullopt;
2726 
2727   if (!Base->isReg())
2728     return std::nullopt;
2729   ExtAddrMode AM;
2730   AM.BaseReg = Base->getReg();
2731   AM.Displacement = Offset;
2732   AM.ScaledReg = 0;
2733   AM.Scale = 0;
2734   return AM;
2735 }
2736 
2737 bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
2738                                            Register Reg,
2739                                            const MachineInstr &AddrI,
2740                                            ExtAddrMode &AM) const {
2741   // Filter out instructions into which we cannot fold.
2742   unsigned NumBytes;
2743   int64_t OffsetScale = 1;
2744   switch (MemI.getOpcode()) {
2745   default:
2746     return false;
2747 
2748   case AArch64::LDURQi:
2749   case AArch64::STURQi:
2750     NumBytes = 16;
2751     break;
2752 
2753   case AArch64::LDURDi:
2754   case AArch64::STURDi:
2755   case AArch64::LDURXi:
2756   case AArch64::STURXi:
2757     NumBytes = 8;
2758     break;
2759 
2760   case AArch64::LDURWi:
2761   case AArch64::LDURSWi:
2762   case AArch64::STURWi:
2763     NumBytes = 4;
2764     break;
2765 
2766   case AArch64::LDURHi:
2767   case AArch64::STURHi:
2768   case AArch64::LDURHHi:
2769   case AArch64::STURHHi:
2770   case AArch64::LDURSHXi:
2771   case AArch64::LDURSHWi:
2772     NumBytes = 2;
2773     break;
2774 
2775   case AArch64::LDRBroX:
2776   case AArch64::LDRBBroX:
2777   case AArch64::LDRSBXroX:
2778   case AArch64::LDRSBWroX:
2779   case AArch64::STRBroX:
2780   case AArch64::STRBBroX:
2781   case AArch64::LDURBi:
2782   case AArch64::LDURBBi:
2783   case AArch64::LDURSBXi:
2784   case AArch64::LDURSBWi:
2785   case AArch64::STURBi:
2786   case AArch64::STURBBi:
2787   case AArch64::LDRBui:
2788   case AArch64::LDRBBui:
2789   case AArch64::LDRSBXui:
2790   case AArch64::LDRSBWui:
2791   case AArch64::STRBui:
2792   case AArch64::STRBBui:
2793     NumBytes = 1;
2794     break;
2795 
2796   case AArch64::LDRQroX:
2797   case AArch64::STRQroX:
2798   case AArch64::LDRQui:
2799   case AArch64::STRQui:
2800     NumBytes = 16;
2801     OffsetScale = 16;
2802     break;
2803 
2804   case AArch64::LDRDroX:
2805   case AArch64::STRDroX:
2806   case AArch64::LDRXroX:
2807   case AArch64::STRXroX:
2808   case AArch64::LDRDui:
2809   case AArch64::STRDui:
2810   case AArch64::LDRXui:
2811   case AArch64::STRXui:
2812     NumBytes = 8;
2813     OffsetScale = 8;
2814     break;
2815 
2816   case AArch64::LDRWroX:
2817   case AArch64::LDRSWroX:
2818   case AArch64::STRWroX:
2819   case AArch64::LDRWui:
2820   case AArch64::LDRSWui:
2821   case AArch64::STRWui:
2822     NumBytes = 4;
2823     OffsetScale = 4;
2824     break;
2825 
2826   case AArch64::LDRHroX:
2827   case AArch64::STRHroX:
2828   case AArch64::LDRHHroX:
2829   case AArch64::STRHHroX:
2830   case AArch64::LDRSHXroX:
2831   case AArch64::LDRSHWroX:
2832   case AArch64::LDRHui:
2833   case AArch64::STRHui:
2834   case AArch64::LDRHHui:
2835   case AArch64::STRHHui:
2836   case AArch64::LDRSHXui:
2837   case AArch64::LDRSHWui:
2838     NumBytes = 2;
2839     OffsetScale = 2;
2840     break;
2841   }
2842 
2843   // Check the fold operand is not the loaded/stored value.
2844   const MachineOperand &BaseRegOp = MemI.getOperand(0);
2845   if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
2846     return false;
2847 
2848   // Handle memory instructions with a [Reg, Reg] addressing mode.
2849   if (MemI.getOperand(2).isReg()) {
2850     // Bail if the addressing mode already includes extension of the offset
2851     // register.
2852     if (MemI.getOperand(3).getImm())
2853       return false;
2854 
2855     // Check if we actually have a scaled offset.
2856     if (MemI.getOperand(4).getImm() == 0)
2857       OffsetScale = 1;
2858     // If the address instruction is folded into the base register, then the
2859     // If the address instructions is folded into the base register, then the
2860     // addressing mode must not have a scale. Then we can swap the base and the
2861     // scaled registers.
2862     if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)
2863       return false;
2864 
2865     switch (AddrI.getOpcode()) {
2866     default:
2867       return false;
2868 
2869     case AArch64::SBFMXri:
2870       // sxtw Xa, Wm
2871       // ldr Xd, [Xn, Xa, lsl #N]
2872       // ->
2873       // ldr Xd, [Xn, Wm, sxtw #N]
2874       if (AddrI.getOperand(2).getImm() != 0 ||
2875           AddrI.getOperand(3).getImm() != 31)
2876         return false;
2877 
2878       AM.BaseReg = MemI.getOperand(1).getReg();
2879       if (AM.BaseReg == Reg)
2880         AM.BaseReg = MemI.getOperand(2).getReg();
2881       AM.ScaledReg = AddrI.getOperand(1).getReg();
2882       AM.Scale = OffsetScale;
2883       AM.Displacement = 0;
2884       AM.Form = ExtAddrMode::Formula::SExtScaledReg;
2885       return true;
2886 
2887     case TargetOpcode::SUBREG_TO_REG: {
2888       // mov Wa, Wm
2889       // ldr Xd, [Xn, Xa, lsl #N]
2890       // ->
2891       // ldr Xd, [Xn, Wm, uxtw #N]
2892 
2893       // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
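      // Schematically (illustrative virtual-register MIR):
      //   %w:gpr32 = ORRWrs $wzr, %src, 0
      //   %x:gpr64 = SUBREG_TO_REG 0, %w, %subreg.sub_32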
2894       if (AddrI.getOperand(1).getImm() != 0 ||
2895           AddrI.getOperand(3).getImm() != AArch64::sub_32)
2896         return false;
2897 
2898       const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
2899       Register OffsetReg = AddrI.getOperand(2).getReg();
2900       if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg))
2901         return false;
2902 
2903       const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg);
2904       if (DefMI.getOpcode() != AArch64::ORRWrs ||
2905           DefMI.getOperand(1).getReg() != AArch64::WZR ||
2906           DefMI.getOperand(3).getImm() != 0)
2907         return false;
2908 
2909       AM.BaseReg = MemI.getOperand(1).getReg();
2910       if (AM.BaseReg == Reg)
2911         AM.BaseReg = MemI.getOperand(2).getReg();
2912       AM.ScaledReg = DefMI.getOperand(2).getReg();
2913       AM.Scale = OffsetScale;
2914       AM.Displacement = 0;
2915       AM.Form = ExtAddrMode::Formula::ZExtScaledReg;
2916       return true;
2917     }
2918     }
2919   }
2920 
2921   // Handle memory instructions with a [Reg, #Imm] addressing mode.
2922 
2923   // Check that we are not breaking a potential conversion to an LDP.
2924   auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
2925                                  int64_t NewOffset) -> bool {
2926     int64_t MinOffset, MaxOffset;
2927     switch (NumBytes) {
2928     default:
2929       return true;
2930     case 4:
2931       MinOffset = -256;
2932       MaxOffset = 252;
2933       break;
2934     case 8:
2935       MinOffset = -512;
2936       MaxOffset = 504;
2937       break;
2938     case 16:
2939       MinOffset = -1024;
2940       MaxOffset = 1008;
2941       break;
2942     }
2943     return OldOffset < MinOffset || OldOffset > MaxOffset ||
2944            (NewOffset >= MinOffset && NewOffset <= MaxOffset);
2945   };
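  // Illustrative trace of the check above (values chosen for this sketch, not
  // taken from a real compilation): for an 8-byte access the LDP immediate
  // range is [-512, 504] bytes. An old offset of 480 combined with a new
  // offset of 520 is rejected, because the old offset was LDP-encodable and
  // the new one no longer is; an old offset of 1000 is accepted regardless,
  // since it could never have formed an LDP in the first place.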
2946   auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
2947     int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;
2948     int64_t NewOffset = OldOffset + Disp;
2949     if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))
2950       return false;
2951     // If the old offset would fit into an LDP, but the new offset wouldn't,
2952     // bail out.
2953     if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
2954       return false;
2955     AM.BaseReg = AddrI.getOperand(1).getReg();
2956     AM.ScaledReg = 0;
2957     AM.Scale = 0;
2958     AM.Displacement = NewOffset;
2959     AM.Form = ExtAddrMode::Formula::Basic;
2960     return true;
2961   };
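  // A minimal worked example of the immediate fold above (illustrative only):
  //   add  x8, x0, #16
  //   ldr  x1, [x8, #8]        // LDRXui, immediate operand 1, OffsetScale 8
  // folds (Disp = 16, OldOffset = 8, NewOffset = 24) into
  //   ldr  x1, [x0, #24]
  // provided the new offset is still legal for an 8-byte access and does not
  // break a possible LDP formation.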
2962 
2963   auto canFoldAddRegIntoAddrMode =
2964       [&](int64_t Scale,
2965           ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool {
2966     if (MemI.getOperand(2).getImm() != 0)
2967       return false;
2968     if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
2969       return false;
2970     AM.BaseReg = AddrI.getOperand(1).getReg();
2971     AM.ScaledReg = AddrI.getOperand(2).getReg();
2972     AM.Scale = Scale;
2973     AM.Displacement = 0;
2974     AM.Form = Form;
2975     return true;
2976   };
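  // Similarly, a register fold handled by the helper above might look like
  // (illustrative only):
  //   add  x8, x0, x2, lsl #3
  //   ldr  x1, [x8]            // the immediate operand must be zero
  // ->
  //   ldr  x1, [x0, x2, lsl #3]
  // where the shift amount must correspond to a legal scale for the access
  // size.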
2977 
2978   auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
2979     unsigned Opcode = MemI.getOpcode();
2980     return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
2981            Subtarget.isSTRQroSlow();
2982   };
2983 
2984   int64_t Disp = 0;
2985   const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
2986   switch (AddrI.getOpcode()) {
2987   default:
2988     return false;
2989 
2990   case AArch64::ADDXri:
2991     // add Xa, Xn, #N
2992     // ldr Xd, [Xa, #M]
2993     // ->
2994     // ldr Xd, [Xn, #N'+M]
2995     Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
2996     return canFoldAddSubImmIntoAddrMode(Disp);
2997 
2998   case AArch64::SUBXri:
2999     // sub Xa, Xn, #N
3000     // ldr Xd, [Xa, #M]
3001     // ->
3002     // ldr Xd, [Xn, #N'+M]
3003     Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3004     return canFoldAddSubImmIntoAddrMode(-Disp);
3005 
3006   case AArch64::ADDXrs: {
3007     // add Xa, Xn, Xm, lsl #N
3008     // ldr Xd, [Xa]
3009     // ->
3010     // ldr Xd, [Xn, Xm, lsl #N]
3011 
3012     // Don't fold the add if the result would be slower, unless optimising for
3013     // size.
3014     unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3015     if (AArch64_AM::getShiftType(Shift) != AArch64_AM::ShiftExtendType::LSL)
3016       return false;
3017     Shift = AArch64_AM::getShiftValue(Shift);
3018     if (!OptSize) {
3019       if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
3020         return false;
3021       if (avoidSlowSTRQ(MemI))
3022         return false;
3023     }
3024     return canFoldAddRegIntoAddrMode(1ULL << Shift);
3025   }
3026 
3027   case AArch64::ADDXrr:
3028     // add Xa, Xn, Xm
3029     // ldr Xd, [Xa]
3030     // ->
3031     // ldr Xd, [Xn, Xm, lsl #0]
3032 
3033     // Don't fold the add if the result would be slower, unless optimising for
3034     // size.
3035     if (!OptSize && avoidSlowSTRQ(MemI))
3036       return false;
3037     return canFoldAddRegIntoAddrMode(1);
3038 
3039   case AArch64::ADDXrx:
3040     // add Xa, Xn, Wm, {s,u}xtw #N
3041     // ldr Xd, [Xa]
3042     // ->
3043     // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3044 
3045     // Don't fold the add if the result would be slower, unless optimising for
3046     // size.
3047     if (!OptSize && avoidSlowSTRQ(MemI))
3048       return false;
3049 
3050     // Can fold only sign-/zero-extend of a word.
3051     unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3052     AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm);
3053     if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3054       return false;
3055 
3056     return canFoldAddRegIntoAddrMode(
3057         1ULL << AArch64_AM::getArithShiftValue(Imm),
3058         (Extend == AArch64_AM::SXTW) ? ExtAddrMode::Formula::SExtScaledReg
3059                                      : ExtAddrMode::Formula::ZExtScaledReg);
3060   }
3061 }
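// Taken together, the cases above let an address-computing instruction be
// folded into a following load/store. For instance (illustrative only), with
//   add x8, x0, #256
//   ldr x1, [x8]
// the fold produces the addressing mode { BaseReg = x0, Displacement = 256 },
// which is later materialised by emitLdStWithAddr below.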
3062 
3063 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3064 // return the opcode of an instruction performing the same operation, but using
3065 // the [Reg, Reg] addressing mode.
3066 static unsigned regOffsetOpcode(unsigned Opcode) {
3067   switch (Opcode) {
3068   default:
3069     llvm_unreachable("Address folding not implemented for instruction");
3070 
3071   case AArch64::LDURQi:
3072   case AArch64::LDRQui:
3073     return AArch64::LDRQroX;
3074   case AArch64::STURQi:
3075   case AArch64::STRQui:
3076     return AArch64::STRQroX;
3077   case AArch64::LDURDi:
3078   case AArch64::LDRDui:
3079     return AArch64::LDRDroX;
3080   case AArch64::STURDi:
3081   case AArch64::STRDui:
3082     return AArch64::STRDroX;
3083   case AArch64::LDURXi:
3084   case AArch64::LDRXui:
3085     return AArch64::LDRXroX;
3086   case AArch64::STURXi:
3087   case AArch64::STRXui:
3088     return AArch64::STRXroX;
3089   case AArch64::LDURWi:
3090   case AArch64::LDRWui:
3091     return AArch64::LDRWroX;
3092   case AArch64::LDURSWi:
3093   case AArch64::LDRSWui:
3094     return AArch64::LDRSWroX;
3095   case AArch64::STURWi:
3096   case AArch64::STRWui:
3097     return AArch64::STRWroX;
3098   case AArch64::LDURHi:
3099   case AArch64::LDRHui:
3100     return AArch64::LDRHroX;
3101   case AArch64::STURHi:
3102   case AArch64::STRHui:
3103     return AArch64::STRHroX;
3104   case AArch64::LDURHHi:
3105   case AArch64::LDRHHui:
3106     return AArch64::LDRHHroX;
3107   case AArch64::STURHHi:
3108   case AArch64::STRHHui:
3109     return AArch64::STRHHroX;
3110   case AArch64::LDURSHXi:
3111   case AArch64::LDRSHXui:
3112     return AArch64::LDRSHXroX;
3113   case AArch64::LDURSHWi:
3114   case AArch64::LDRSHWui:
3115     return AArch64::LDRSHWroX;
3116   case AArch64::LDURBi:
3117   case AArch64::LDRBui:
3118     return AArch64::LDRBroX;
3119   case AArch64::LDURBBi:
3120   case AArch64::LDRBBui:
3121     return AArch64::LDRBBroX;
3122   case AArch64::LDURSBXi:
3123   case AArch64::LDRSBXui:
3124     return AArch64::LDRSBXroX;
3125   case AArch64::LDURSBWi:
3126   case AArch64::LDRSBWui:
3127     return AArch64::LDRSBWroX;
3128   case AArch64::STURBi:
3129   case AArch64::STRBui:
3130     return AArch64::STRBroX;
3131   case AArch64::STURBBi:
3132   case AArch64::STRBBui:
3133     return AArch64::STRBBroX;
3134   }
3135 }
3136 
3137 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3138 // the opcode of an instruction performing the same operation, but using the
3139 // [Reg, #Imm] addressing mode with scaled offset.
3140 unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3141   switch (Opcode) {
3142   default:
3143     llvm_unreachable("Address folding not implemented for instruction");
3144 
3145   case AArch64::LDURQi:
3146     Scale = 16;
3147     return AArch64::LDRQui;
3148   case AArch64::STURQi:
3149     Scale = 16;
3150     return AArch64::STRQui;
3151   case AArch64::LDURDi:
3152     Scale = 8;
3153     return AArch64::LDRDui;
3154   case AArch64::STURDi:
3155     Scale = 8;
3156     return AArch64::STRDui;
3157   case AArch64::LDURXi:
3158     Scale = 8;
3159     return AArch64::LDRXui;
3160   case AArch64::STURXi:
3161     Scale = 8;
3162     return AArch64::STRXui;
3163   case AArch64::LDURWi:
3164     Scale = 4;
3165     return AArch64::LDRWui;
3166   case AArch64::LDURSWi:
3167     Scale = 4;
3168     return AArch64::LDRSWui;
3169   case AArch64::STURWi:
3170     Scale = 4;
3171     return AArch64::STRWui;
3172   case AArch64::LDURHi:
3173     Scale = 2;
3174     return AArch64::LDRHui;
3175   case AArch64::STURHi:
3176     Scale = 2;
3177     return AArch64::STRHui;
3178   case AArch64::LDURHHi:
3179     Scale = 2;
3180     return AArch64::LDRHHui;
3181   case AArch64::STURHHi:
3182     Scale = 2;
3183     return AArch64::STRHHui;
3184   case AArch64::LDURSHXi:
3185     Scale = 2;
3186     return AArch64::LDRSHXui;
3187   case AArch64::LDURSHWi:
3188     Scale = 2;
3189     return AArch64::LDRSHWui;
3190   case AArch64::LDURBi:
3191     Scale = 1;
3192     return AArch64::LDRBui;
3193   case AArch64::LDURBBi:
3194     Scale = 1;
3195     return AArch64::LDRBBui;
3196   case AArch64::LDURSBXi:
3197     Scale = 1;
3198     return AArch64::LDRSBXui;
3199   case AArch64::LDURSBWi:
3200     Scale = 1;
3201     return AArch64::LDRSBWui;
3202   case AArch64::STURBi:
3203     Scale = 1;
3204     return AArch64::STRBui;
3205   case AArch64::STURBBi:
3206     Scale = 1;
3207     return AArch64::STRBBui;
3208   case AArch64::LDRQui:
3209   case AArch64::STRQui:
3210     Scale = 16;
3211     return Opcode;
3212   case AArch64::LDRDui:
3213   case AArch64::STRDui:
3214   case AArch64::LDRXui:
3215   case AArch64::STRXui:
3216     Scale = 8;
3217     return Opcode;
3218   case AArch64::LDRWui:
3219   case AArch64::LDRSWui:
3220   case AArch64::STRWui:
3221     Scale = 4;
3222     return Opcode;
3223   case AArch64::LDRHui:
3224   case AArch64::STRHui:
3225   case AArch64::LDRHHui:
3226   case AArch64::STRHHui:
3227   case AArch64::LDRSHXui:
3228   case AArch64::LDRSHWui:
3229     Scale = 2;
3230     return Opcode;
3231   case AArch64::LDRBui:
3232   case AArch64::LDRBBui:
3233   case AArch64::LDRSBXui:
3234   case AArch64::LDRSBWui:
3235   case AArch64::STRBui:
3236   case AArch64::STRBBui:
3237     Scale = 1;
3238     return Opcode;
3239   }
3240 }
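// For example (illustrative), scaledOffsetOpcode(AArch64::LDURXi, Scale)
// returns AArch64::LDRXui and sets Scale to 8, so a byte displacement must be
// divided by 8 before it is used as the immediate of the scaled form.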
3241 
3242 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3243 // the opcode of an instruction performing the same operation, but using the
3244 // [Reg, #Imm] addressing mode with unscaled offset.
3245 unsigned unscaledOffsetOpcode(unsigned Opcode) {
3246   switch (Opcode) {
3247   default:
3248     llvm_unreachable("Address folding not implemented for instruction");
3249 
3250   case AArch64::LDURQi:
3251   case AArch64::STURQi:
3252   case AArch64::LDURDi:
3253   case AArch64::STURDi:
3254   case AArch64::LDURXi:
3255   case AArch64::STURXi:
3256   case AArch64::LDURWi:
3257   case AArch64::LDURSWi:
3258   case AArch64::STURWi:
3259   case AArch64::LDURHi:
3260   case AArch64::STURHi:
3261   case AArch64::LDURHHi:
3262   case AArch64::STURHHi:
3263   case AArch64::LDURSHXi:
3264   case AArch64::LDURSHWi:
3265   case AArch64::LDURBi:
3266   case AArch64::STURBi:
3267   case AArch64::LDURBBi:
3268   case AArch64::STURBBi:
3269   case AArch64::LDURSBWi:
3270   case AArch64::LDURSBXi:
3271     return Opcode;
3272   case AArch64::LDRQui:
3273     return AArch64::LDURQi;
3274   case AArch64::STRQui:
3275     return AArch64::STURQi;
3276   case AArch64::LDRDui:
3277     return AArch64::LDURDi;
3278   case AArch64::STRDui:
3279     return AArch64::STURDi;
3280   case AArch64::LDRXui:
3281     return AArch64::LDURXi;
3282   case AArch64::STRXui:
3283     return AArch64::STURXi;
3284   case AArch64::LDRWui:
3285     return AArch64::LDURWi;
3286   case AArch64::LDRSWui:
3287     return AArch64::LDURSWi;
3288   case AArch64::STRWui:
3289     return AArch64::STURWi;
3290   case AArch64::LDRHui:
3291     return AArch64::LDURHi;
3292   case AArch64::STRHui:
3293     return AArch64::STURHi;
3294   case AArch64::LDRHHui:
3295     return AArch64::LDURHHi;
3296   case AArch64::STRHHui:
3297     return AArch64::STURHHi;
3298   case AArch64::LDRSHXui:
3299     return AArch64::LDURSHXi;
3300   case AArch64::LDRSHWui:
3301     return AArch64::LDURSHWi;
3302   case AArch64::LDRBBui:
3303     return AArch64::LDURBBi;
3304   case AArch64::LDRBui:
3305     return AArch64::LDURBi;
3306   case AArch64::STRBBui:
3307     return AArch64::STURBBi;
3308   case AArch64::STRBui:
3309     return AArch64::STURBi;
3310   case AArch64::LDRSBWui:
3311     return AArch64::LDURSBWi;
3312   case AArch64::LDRSBXui:
3313     return AArch64::LDURSBXi;
3314   }
3315 }
3316 
3317 // Given the opcode of a memory load/store instruction, return the opcode of an
3318 // instruction performing the same operation, but using
3319 // the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
3320 // offset register.
3321 static unsigned offsetExtendOpcode(unsigned Opcode) {
3322   switch (Opcode) {
3323   default:
3324     llvm_unreachable("Address folding not implemented for instruction");
3325 
3326   case AArch64::LDRQroX:
3327   case AArch64::LDURQi:
3328   case AArch64::LDRQui:
3329     return AArch64::LDRQroW;
3330   case AArch64::STRQroX:
3331   case AArch64::STURQi:
3332   case AArch64::STRQui:
3333     return AArch64::STRQroW;
3334   case AArch64::LDRDroX:
3335   case AArch64::LDURDi:
3336   case AArch64::LDRDui:
3337     return AArch64::LDRDroW;
3338   case AArch64::STRDroX:
3339   case AArch64::STURDi:
3340   case AArch64::STRDui:
3341     return AArch64::STRDroW;
3342   case AArch64::LDRXroX:
3343   case AArch64::LDURXi:
3344   case AArch64::LDRXui:
3345     return AArch64::LDRXroW;
3346   case AArch64::STRXroX:
3347   case AArch64::STURXi:
3348   case AArch64::STRXui:
3349     return AArch64::STRXroW;
3350   case AArch64::LDRWroX:
3351   case AArch64::LDURWi:
3352   case AArch64::LDRWui:
3353     return AArch64::LDRWroW;
3354   case AArch64::LDRSWroX:
3355   case AArch64::LDURSWi:
3356   case AArch64::LDRSWui:
3357     return AArch64::LDRSWroW;
3358   case AArch64::STRWroX:
3359   case AArch64::STURWi:
3360   case AArch64::STRWui:
3361     return AArch64::STRWroW;
3362   case AArch64::LDRHroX:
3363   case AArch64::LDURHi:
3364   case AArch64::LDRHui:
3365     return AArch64::LDRHroW;
3366   case AArch64::STRHroX:
3367   case AArch64::STURHi:
3368   case AArch64::STRHui:
3369     return AArch64::STRHroW;
3370   case AArch64::LDRHHroX:
3371   case AArch64::LDURHHi:
3372   case AArch64::LDRHHui:
3373     return AArch64::LDRHHroW;
3374   case AArch64::STRHHroX:
3375   case AArch64::STURHHi:
3376   case AArch64::STRHHui:
3377     return AArch64::STRHHroW;
3378   case AArch64::LDRSHXroX:
3379   case AArch64::LDURSHXi:
3380   case AArch64::LDRSHXui:
3381     return AArch64::LDRSHXroW;
3382   case AArch64::LDRSHWroX:
3383   case AArch64::LDURSHWi:
3384   case AArch64::LDRSHWui:
3385     return AArch64::LDRSHWroW;
3386   case AArch64::LDRBroX:
3387   case AArch64::LDURBi:
3388   case AArch64::LDRBui:
3389     return AArch64::LDRBroW;
3390   case AArch64::LDRBBroX:
3391   case AArch64::LDURBBi:
3392   case AArch64::LDRBBui:
3393     return AArch64::LDRBBroW;
3394   case AArch64::LDRSBXroX:
3395   case AArch64::LDURSBXi:
3396   case AArch64::LDRSBXui:
3397     return AArch64::LDRSBXroW;
3398   case AArch64::LDRSBWroX:
3399   case AArch64::LDURSBWi:
3400   case AArch64::LDRSBWui:
3401     return AArch64::LDRSBWroW;
3402   case AArch64::STRBroX:
3403   case AArch64::STURBi:
3404   case AArch64::STRBui:
3405     return AArch64::STRBroW;
3406   case AArch64::STRBBroX:
3407   case AArch64::STURBBi:
3408   case AArch64::STRBBui:
3409     return AArch64::STRBBroW;
3410   }
3411 }
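// E.g. (illustrative) offsetExtendOpcode(AArch64::LDRXui) returns
// AArch64::LDRXroW, the form that takes a 32-bit offset register with an
// explicit sxtw/uxtw extension.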
3412 
3413 MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI,
3414                                                  const ExtAddrMode &AM) const {
3415 
3416   const DebugLoc &DL = MemI.getDebugLoc();
3417   MachineBasicBlock &MBB = *MemI.getParent();
3418   MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();
3419 
3420   if (AM.Form == ExtAddrMode::Formula::Basic) {
3421     if (AM.ScaledReg) {
3422       // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
3423       unsigned Opcode = regOffsetOpcode(MemI.getOpcode());
3424       MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
3425       auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3426                    .addReg(MemI.getOperand(0).getReg(),
3427                            MemI.mayLoad() ? RegState::Define : 0)
3428                    .addReg(AM.BaseReg)
3429                    .addReg(AM.ScaledReg)
3430                    .addImm(0)
3431                    .addImm(AM.Scale > 1)
3432                    .setMemRefs(MemI.memoperands())
3433                    .setMIFlags(MemI.getFlags());
3434       return B.getInstr();
3435     }
3436 
3437     assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
3438            "Addressing mode not supported for folding");
3439 
3440     // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
3441     unsigned Scale = 1;
3442     unsigned Opcode = MemI.getOpcode();
3443     if (isInt<9>(AM.Displacement))
3444       Opcode = unscaledOffsetOpcode(Opcode);
3445     else
3446       Opcode = scaledOffsetOpcode(Opcode, Scale);
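    // Displacements that fit in a signed 9-bit field can use the unscaled
    // LDUR/STUR form directly; otherwise the scaled LDR/STR form is used and
    // the displacement is divided by the access scale below (it is expected to
    // be a multiple of the scale by this point).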
3447 
3448     auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3449                  .addReg(MemI.getOperand(0).getReg(),
3450                          MemI.mayLoad() ? RegState::Define : 0)
3451                  .addReg(AM.BaseReg)
3452                  .addImm(AM.Displacement / Scale)
3453                  .setMemRefs(MemI.memoperands())
3454                  .setMIFlags(MemI.getFlags());
3455     return B.getInstr();
3456   }
3457 
3458   if (AM.Form == ExtAddrMode::Formula::SExtScaledReg ||
3459       AM.Form == ExtAddrMode::Formula::ZExtScaledReg) {
3460     // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
3461     assert(AM.ScaledReg && !AM.Displacement &&
3462            "Address offset can be a register or an immediate, but not both");
3463     unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());
3464     MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
3465     // Make sure the offset register is in the correct register class.
3466     Register OffsetReg = AM.ScaledReg;
3467     const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg);
3468     if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {
3469       OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3470       BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)
3471           .addReg(AM.ScaledReg, 0, AArch64::sub_32);
3472     }
3473     auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3474                  .addReg(MemI.getOperand(0).getReg(),
3475                          MemI.mayLoad() ? RegState::Define : 0)
3476                  .addReg(AM.BaseReg)
3477                  .addReg(OffsetReg)
3478                  .addImm(AM.Form == ExtAddrMode::Formula::SExtScaledReg)
3479                  .addImm(AM.Scale != 1)
3480                  .setMemRefs(MemI.memoperands())
3481                  .setMIFlags(MemI.getFlags());
3482 
3483     return B.getInstr();
3484   }
3485 
3486   llvm_unreachable(
3487       "Function must not be called with an addressing mode it can't handle");
3488 }
3489 
3490 bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
3491     const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
3492     bool &OffsetIsScalable, TypeSize &Width,
3493     const TargetRegisterInfo *TRI) const {
3494   assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
3495   // Handle only loads/stores with base register followed by immediate offset.
3496   if (LdSt.getNumExplicitOperands() == 3) {
3497     // Non-paired instruction (e.g., ldr x1, [x0, #8]).
3498     if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
3499         !LdSt.getOperand(2).isImm())
3500       return false;
3501   } else if (LdSt.getNumExplicitOperands() == 4) {
3502     // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
3503     if (!LdSt.getOperand(1).isReg() ||
3504         (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
3505         !LdSt.getOperand(3).isImm())
3506       return false;
3507   } else
3508     return false;
3509 
3510   // Get the scaling factor for the instruction and set the width for the
3511   // instruction.
3512   TypeSize Scale(0U, false);
3513   int64_t Dummy1, Dummy2;
3514 
3515   // If this returns false, then it's an instruction we don't want to handle.
3516   if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
3517     return false;
3518 
3519   // Compute the offset. Offset is calculated as the immediate operand
3520   // multiplied by the scaling factor. Unscaled instructions have scaling factor
3521   // set to 1.
3522   if (LdSt.getNumExplicitOperands() == 3) {
3523     BaseOp = &LdSt.getOperand(1);
3524     Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue();
3525   } else {
3526     assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
3527     BaseOp = &LdSt.getOperand(2);
3528     Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue();
3529   }
3530   OffsetIsScalable = Scale.isScalable();
3531 
3532   if (!BaseOp->isReg() && !BaseOp->isFI())
3533     return false;
3534 
3535   return true;
3536 }
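// As an illustration (not from the original source): for `ldr x1, [x0, #8]`
// (LDRXui with immediate operand 1) this returns BaseOp pointing at the x0
// operand, Scale = 8 and therefore Offset = 8, with OffsetIsScalable = false.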
3537 
3538 MachineOperand &
3539 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
3540   assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
3541   MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
3542   assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
3543   return OfsOp;
3544 }
3545 
3546 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
3547                                     TypeSize &Width, int64_t &MinOffset,
3548                                     int64_t &MaxOffset) {
3549   switch (Opcode) {
3550   // Not a memory operation, or not something we want to handle.
3551   default:
3552     Scale = TypeSize::getFixed(0);
3553     Width = TypeSize::getFixed(0);
3554     MinOffset = MaxOffset = 0;
3555     return false;
3556   // LDR / STR
3557   case AArch64::LDRQui:
3558   case AArch64::STRQui:
3559     Scale = TypeSize::getFixed(16);
3560     Width = TypeSize::getFixed(16);
3561     MinOffset = 0;
3562     MaxOffset = 4095;
3563     break;
3564   case AArch64::LDRXui:
3565   case AArch64::LDRDui:
3566   case AArch64::STRXui:
3567   case AArch64::STRDui:
3568   case AArch64::PRFMui:
3569     Scale = TypeSize::getFixed(8);
3570     Width = TypeSize::getFixed(8);
3571     MinOffset = 0;
3572     MaxOffset = 4095;
3573     break;
3574   case AArch64::LDRWui:
3575   case AArch64::LDRSui:
3576   case AArch64::LDRSWui:
3577   case AArch64::STRWui:
3578   case AArch64::STRSui:
3579     Scale = TypeSize::getFixed(4);
3580     Width = TypeSize::getFixed(4);
3581     MinOffset = 0;
3582     MaxOffset = 4095;
3583     break;
3584   case AArch64::LDRHui:
3585   case AArch64::LDRHHui:
3586   case AArch64::LDRSHWui:
3587   case AArch64::LDRSHXui:
3588   case AArch64::STRHui:
3589   case AArch64::STRHHui:
3590     Scale = TypeSize::getFixed(2);
3591     Width = TypeSize::getFixed(2);
3592     MinOffset = 0;
3593     MaxOffset = 4095;
3594     break;
3595   case AArch64::LDRBui:
3596   case AArch64::LDRBBui:
3597   case AArch64::LDRSBWui:
3598   case AArch64::LDRSBXui:
3599   case AArch64::STRBui:
3600   case AArch64::STRBBui:
3601     Scale = TypeSize::getFixed(1);
3602     Width = TypeSize::getFixed(1);
3603     MinOffset = 0;
3604     MaxOffset = 4095;
3605     break;
3606   // post/pre inc
3607   case AArch64::STRQpre:
3608   case AArch64::LDRQpost:
3609     Scale = TypeSize::getFixed(1);
3610     Width = TypeSize::getFixed(16);
3611     MinOffset = -256;
3612     MaxOffset = 255;
3613     break;
3614   case AArch64::STRXpre:
3615   case AArch64::STRDpre:
3616   case AArch64::LDRXpost:
3617   case AArch64::LDRDpost:
3618     Scale = TypeSize::getFixed(1);
3619     Width = TypeSize::getFixed(8);
3620     MinOffset = -256;
3621     MaxOffset = 255;
3622     break;
3623   case AArch64::STRWpost:
3624   case AArch64::LDRWpost:
3625     Scale = TypeSize::getFixed(4);
3626     Width = TypeSize::getFixed(32);
3627     MinOffset = -256;
3628     MaxOffset = 255;
3629     break;
3630   // Unscaled
3631   case AArch64::LDURQi:
3632   case AArch64::STURQi:
3633     Scale = TypeSize::getFixed(1);
3634     Width = TypeSize::getFixed(16);
3635     MinOffset = -256;
3636     MaxOffset = 255;
3637     break;
3638   case AArch64::LDURXi:
3639   case AArch64::LDURDi:
3640   case AArch64::LDAPURXi:
3641   case AArch64::STURXi:
3642   case AArch64::STURDi:
3643   case AArch64::STLURXi:
3644   case AArch64::PRFUMi:
3645     Scale = TypeSize::getFixed(1);
3646     Width = TypeSize::getFixed(8);
3647     MinOffset = -256;
3648     MaxOffset = 255;
3649     break;
3650   case AArch64::LDURWi:
3651   case AArch64::LDURSi:
3652   case AArch64::LDURSWi:
3653   case AArch64::LDAPURi:
3654   case AArch64::LDAPURSWi:
3655   case AArch64::STURWi:
3656   case AArch64::STURSi:
3657   case AArch64::STLURWi:
3658     Scale = TypeSize::getFixed(1);
3659     Width = TypeSize::getFixed(4);
3660     MinOffset = -256;
3661     MaxOffset = 255;
3662     break;
3663   case AArch64::LDURHi:
3664   case AArch64::LDURHHi:
3665   case AArch64::LDURSHXi:
3666   case AArch64::LDURSHWi:
3667   case AArch64::LDAPURHi:
3668   case AArch64::LDAPURSHWi:
3669   case AArch64::LDAPURSHXi:
3670   case AArch64::STURHi:
3671   case AArch64::STURHHi:
3672   case AArch64::STLURHi:
3673     Scale = TypeSize::getFixed(1);
3674     Width = TypeSize::getFixed(2);
3675     MinOffset = -256;
3676     MaxOffset = 255;
3677     break;
3678   case AArch64::LDURBi:
3679   case AArch64::LDURBBi:
3680   case AArch64::LDURSBXi:
3681   case AArch64::LDURSBWi:
3682   case AArch64::LDAPURBi:
3683   case AArch64::LDAPURSBWi:
3684   case AArch64::LDAPURSBXi:
3685   case AArch64::STURBi:
3686   case AArch64::STURBBi:
3687   case AArch64::STLURBi:
3688     Scale = TypeSize::getFixed(1);
3689     Width = TypeSize::getFixed(1);
3690     MinOffset = -256;
3691     MaxOffset = 255;
3692     break;
3693   // LDP / STP
3694   case AArch64::LDPQi:
3695   case AArch64::LDNPQi:
3696   case AArch64::STPQi:
3697   case AArch64::STNPQi:
3698     Scale = TypeSize::getFixed(16);
3699     Width = TypeSize::getFixed(32);
3700     MinOffset = -64;
3701     MaxOffset = 63;
3702     break;
3703   case AArch64::LDPXi:
3704   case AArch64::LDPDi:
3705   case AArch64::LDNPXi:
3706   case AArch64::LDNPDi:
3707   case AArch64::STPXi:
3708   case AArch64::STPDi:
3709   case AArch64::STNPXi:
3710   case AArch64::STNPDi:
3711     Scale = TypeSize::getFixed(8);
3712     Width = TypeSize::getFixed(16);
3713     MinOffset = -64;
3714     MaxOffset = 63;
3715     break;
3716   case AArch64::LDPWi:
3717   case AArch64::LDPSi:
3718   case AArch64::LDNPWi:
3719   case AArch64::LDNPSi:
3720   case AArch64::STPWi:
3721   case AArch64::STPSi:
3722   case AArch64::STNPWi:
3723   case AArch64::STNPSi:
3724     Scale = TypeSize::getFixed(4);
3725     Width = TypeSize::getFixed(8);
3726     MinOffset = -64;
3727     MaxOffset = 63;
3728     break;
3729   // pre/post inc
3730   case AArch64::STPQpre:
3731   case AArch64::LDPQpost:
3732     Scale = TypeSize::getFixed(16);
3733     Width = TypeSize::getFixed(16);
3734     MinOffset = -1024;
3735     MaxOffset = 1008;
3736     break;
3737   case AArch64::STPXpre:
3738   case AArch64::LDPXpost:
3739   case AArch64::STPDpre:
3740   case AArch64::LDPDpost:
3741     Scale = TypeSize::getFixed(8);
3742     Width = TypeSize::getFixed(8);
3743     MinOffset = -512;
3744     MaxOffset = 504;
3745     break;
3746   case AArch64::StoreSwiftAsyncContext:
3747     // Store is an STRXui, but there might be an ADDXri in the expansion too.
3748     Scale = TypeSize::getFixed(1);
3749     Width = TypeSize::getFixed(8);
3750     MinOffset = 0;
3751     MaxOffset = 4095;
3752     break;
3753   case AArch64::ADDG:
3754     Scale = TypeSize::getFixed(16);
3755     Width = TypeSize::getFixed(0);
3756     MinOffset = 0;
3757     MaxOffset = 63;
3758     break;
3759   case AArch64::TAGPstack:
3760     Scale = TypeSize::getFixed(16);
3761     Width = TypeSize::getFixed(0);
3762     // TAGP with a negative offset turns into SUBP, which has a maximum offset
3763     // of 63 (not 64!).
3764     MinOffset = -63;
3765     MaxOffset = 63;
3766     break;
3767   case AArch64::LDG:
3768   case AArch64::STGi:
3769   case AArch64::STZGi:
3770     Scale = TypeSize::getFixed(16);
3771     Width = TypeSize::getFixed(16);
3772     MinOffset = -256;
3773     MaxOffset = 255;
3774     break;
3775   // SVE
3776   case AArch64::STR_ZZZZXI:
3777   case AArch64::LDR_ZZZZXI:
3778     Scale = TypeSize::getScalable(16);
3779     Width = TypeSize::getScalable(16 * 4);
3780     MinOffset = -256;
3781     MaxOffset = 252;
3782     break;
3783   case AArch64::STR_ZZZXI:
3784   case AArch64::LDR_ZZZXI:
3785     Scale = TypeSize::getScalable(16);
3786     Width = TypeSize::getScalable(16 * 3);
3787     MinOffset = -256;
3788     MaxOffset = 253;
3789     break;
3790   case AArch64::STR_ZZXI:
3791   case AArch64::LDR_ZZXI:
3792     Scale = TypeSize::getScalable(16);
3793     Width = TypeSize::getScalable(16 * 2);
3794     MinOffset = -256;
3795     MaxOffset = 254;
3796     break;
3797   case AArch64::LDR_PXI:
3798   case AArch64::STR_PXI:
3799     Scale = TypeSize::getScalable(2);
3800     Width = TypeSize::getScalable(2);
3801     MinOffset = -256;
3802     MaxOffset = 255;
3803     break;
3804   case AArch64::LDR_PPXI:
3805   case AArch64::STR_PPXI:
3806     Scale = TypeSize::getScalable(2);
3807     Width = TypeSize::getScalable(2 * 2);
3808     MinOffset = -256;
3809     MaxOffset = 254;
3810     break;
3811   case AArch64::LDR_ZXI:
3812   case AArch64::STR_ZXI:
3813     Scale = TypeSize::getScalable(16);
3814     Width = TypeSize::getScalable(16);
3815     MinOffset = -256;
3816     MaxOffset = 255;
3817     break;
3818   case AArch64::LD1B_IMM:
3819   case AArch64::LD1H_IMM:
3820   case AArch64::LD1W_IMM:
3821   case AArch64::LD1D_IMM:
3822   case AArch64::LDNT1B_ZRI:
3823   case AArch64::LDNT1H_ZRI:
3824   case AArch64::LDNT1W_ZRI:
3825   case AArch64::LDNT1D_ZRI:
3826   case AArch64::ST1B_IMM:
3827   case AArch64::ST1H_IMM:
3828   case AArch64::ST1W_IMM:
3829   case AArch64::ST1D_IMM:
3830   case AArch64::STNT1B_ZRI:
3831   case AArch64::STNT1H_ZRI:
3832   case AArch64::STNT1W_ZRI:
3833   case AArch64::STNT1D_ZRI:
3834   case AArch64::LDNF1B_IMM:
3835   case AArch64::LDNF1H_IMM:
3836   case AArch64::LDNF1W_IMM:
3837   case AArch64::LDNF1D_IMM:
3838     // A full vector's worth of data
3839     // Width = mbytes * elements
3840     Scale = TypeSize::getScalable(16);
3841     Width = TypeSize::getScalable(16);
3842     MinOffset = -8;
3843     MaxOffset = 7;
3844     break;
3845   case AArch64::LD2B_IMM:
3846   case AArch64::LD2H_IMM:
3847   case AArch64::LD2W_IMM:
3848   case AArch64::LD2D_IMM:
3849   case AArch64::ST2B_IMM:
3850   case AArch64::ST2H_IMM:
3851   case AArch64::ST2W_IMM:
3852   case AArch64::ST2D_IMM:
3853     Scale = TypeSize::getScalable(32);
3854     Width = TypeSize::getScalable(16 * 2);
3855     MinOffset = -8;
3856     MaxOffset = 7;
3857     break;
3858   case AArch64::LD3B_IMM:
3859   case AArch64::LD3H_IMM:
3860   case AArch64::LD3W_IMM:
3861   case AArch64::LD3D_IMM:
3862   case AArch64::ST3B_IMM:
3863   case AArch64::ST3H_IMM:
3864   case AArch64::ST3W_IMM:
3865   case AArch64::ST3D_IMM:
3866     Scale = TypeSize::getScalable(48);
3867     Width = TypeSize::getScalable(16 * 3);
3868     MinOffset = -8;
3869     MaxOffset = 7;
3870     break;
3871   case AArch64::LD4B_IMM:
3872   case AArch64::LD4H_IMM:
3873   case AArch64::LD4W_IMM:
3874   case AArch64::LD4D_IMM:
3875   case AArch64::ST4B_IMM:
3876   case AArch64::ST4H_IMM:
3877   case AArch64::ST4W_IMM:
3878   case AArch64::ST4D_IMM:
3879     Scale = TypeSize::getScalable(64);
3880     Width = TypeSize::getScalable(16 * 4);
3881     MinOffset = -8;
3882     MaxOffset = 7;
3883     break;
3884   case AArch64::LD1B_H_IMM:
3885   case AArch64::LD1SB_H_IMM:
3886   case AArch64::LD1H_S_IMM:
3887   case AArch64::LD1SH_S_IMM:
3888   case AArch64::LD1W_D_IMM:
3889   case AArch64::LD1SW_D_IMM:
3890   case AArch64::ST1B_H_IMM:
3891   case AArch64::ST1H_S_IMM:
3892   case AArch64::ST1W_D_IMM:
3893   case AArch64::LDNF1B_H_IMM:
3894   case AArch64::LDNF1SB_H_IMM:
3895   case AArch64::LDNF1H_S_IMM:
3896   case AArch64::LDNF1SH_S_IMM:
3897   case AArch64::LDNF1W_D_IMM:
3898   case AArch64::LDNF1SW_D_IMM:
3899     // A half vector's worth of data
3900     // Width = mbytes * elements
3901     Scale = TypeSize::getScalable(8);
3902     Width = TypeSize::getScalable(8);
3903     MinOffset = -8;
3904     MaxOffset = 7;
3905     break;
3906   case AArch64::LD1B_S_IMM:
3907   case AArch64::LD1SB_S_IMM:
3908   case AArch64::LD1H_D_IMM:
3909   case AArch64::LD1SH_D_IMM:
3910   case AArch64::ST1B_S_IMM:
3911   case AArch64::ST1H_D_IMM:
3912   case AArch64::LDNF1B_S_IMM:
3913   case AArch64::LDNF1SB_S_IMM:
3914   case AArch64::LDNF1H_D_IMM:
3915   case AArch64::LDNF1SH_D_IMM:
3916     // A quarter vector's worth of data
3917     // Width = mbytes * elements
3918     Scale = TypeSize::getScalable(4);
3919     Width = TypeSize::getScalable(4);
3920     MinOffset = -8;
3921     MaxOffset = 7;
3922     break;
3923   case AArch64::LD1B_D_IMM:
3924   case AArch64::LD1SB_D_IMM:
3925   case AArch64::ST1B_D_IMM:
3926   case AArch64::LDNF1B_D_IMM:
3927   case AArch64::LDNF1SB_D_IMM:
3928     // An eighth vector's worth of data
3929     // Width = mbytes * elements
3930     Scale = TypeSize::getScalable(2);
3931     Width = TypeSize::getScalable(2);
3932     MinOffset = -8;
3933     MaxOffset = 7;
3934     break;
3935   case AArch64::ST2Gi:
3936   case AArch64::STZ2Gi:
3937     Scale = TypeSize::getFixed(16);
3938     Width = TypeSize::getFixed(32);
3939     MinOffset = -256;
3940     MaxOffset = 255;
3941     break;
3942   case AArch64::STGPi:
3943     Scale = TypeSize::getFixed(16);
3944     Width = TypeSize::getFixed(16);
3945     MinOffset = -64;
3946     MaxOffset = 63;
3947     break;
3948   case AArch64::LD1RB_IMM:
3949   case AArch64::LD1RB_H_IMM:
3950   case AArch64::LD1RB_S_IMM:
3951   case AArch64::LD1RB_D_IMM:
3952   case AArch64::LD1RSB_H_IMM:
3953   case AArch64::LD1RSB_S_IMM:
3954   case AArch64::LD1RSB_D_IMM:
3955     Scale = TypeSize::getFixed(1);
3956     Width = TypeSize::getFixed(1);
3957     MinOffset = 0;
3958     MaxOffset = 63;
3959     break;
3960   case AArch64::LD1RH_IMM:
3961   case AArch64::LD1RH_S_IMM:
3962   case AArch64::LD1RH_D_IMM:
3963   case AArch64::LD1RSH_S_IMM:
3964   case AArch64::LD1RSH_D_IMM:
3965     Scale = TypeSize::getFixed(2);
3966     Width = TypeSize::getFixed(2);
3967     MinOffset = 0;
3968     MaxOffset = 63;
3969     break;
3970   case AArch64::LD1RW_IMM:
3971   case AArch64::LD1RW_D_IMM:
3972   case AArch64::LD1RSW_IMM:
3973     Scale = TypeSize::getFixed(4);
3974     Width = TypeSize::getFixed(4);
3975     MinOffset = 0;
3976     MaxOffset = 63;
3977     break;
3978   case AArch64::LD1RD_IMM:
3979     Scale = TypeSize::getFixed(8);
3980     Width = TypeSize::getFixed(8);
3981     MinOffset = 0;
3982     MaxOffset = 63;
3983     break;
3984   }
3985 
3986   return true;
3987 }
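// Example use (illustrative): querying AArch64::LDRXui yields Scale = 8,
// Width = 8 and an immediate range of [0, 4095], i.e. byte offsets from 0 to
// 32760 in steps of 8; querying AArch64::LDURXi instead yields Scale = 1 and
// a byte-offset range of [-256, 255].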
3988 
3989 // Scaling factor for unscaled load or store.
3990 int AArch64InstrInfo::getMemScale(unsigned Opc) {
3991   switch (Opc) {
3992   default:
3993     llvm_unreachable("Opcode has unknown scale!");
3994   case AArch64::LDRBBui:
3995   case AArch64::LDURBBi:
3996   case AArch64::LDRSBWui:
3997   case AArch64::LDURSBWi:
3998   case AArch64::STRBBui:
3999   case AArch64::STURBBi:
4000     return 1;
4001   case AArch64::LDRHHui:
4002   case AArch64::LDURHHi:
4003   case AArch64::LDRSHWui:
4004   case AArch64::LDURSHWi:
4005   case AArch64::STRHHui:
4006   case AArch64::STURHHi:
4007     return 2;
4008   case AArch64::LDRSui:
4009   case AArch64::LDURSi:
4010   case AArch64::LDRSpre:
4011   case AArch64::LDRSWui:
4012   case AArch64::LDURSWi:
4013   case AArch64::LDRSWpre:
4014   case AArch64::LDRWpre:
4015   case AArch64::LDRWui:
4016   case AArch64::LDURWi:
4017   case AArch64::STRSui:
4018   case AArch64::STURSi:
4019   case AArch64::STRSpre:
4020   case AArch64::STRWui:
4021   case AArch64::STURWi:
4022   case AArch64::STRWpre:
4023   case AArch64::LDPSi:
4024   case AArch64::LDPSWi:
4025   case AArch64::LDPWi:
4026   case AArch64::STPSi:
4027   case AArch64::STPWi:
4028     return 4;
4029   case AArch64::LDRDui:
4030   case AArch64::LDURDi:
4031   case AArch64::LDRDpre:
4032   case AArch64::LDRXui:
4033   case AArch64::LDURXi:
4034   case AArch64::LDRXpre:
4035   case AArch64::STRDui:
4036   case AArch64::STURDi:
4037   case AArch64::STRDpre:
4038   case AArch64::STRXui:
4039   case AArch64::STURXi:
4040   case AArch64::STRXpre:
4041   case AArch64::LDPDi:
4042   case AArch64::LDPXi:
4043   case AArch64::STPDi:
4044   case AArch64::STPXi:
4045     return 8;
4046   case AArch64::LDRQui:
4047   case AArch64::LDURQi:
4048   case AArch64::STRQui:
4049   case AArch64::STURQi:
4050   case AArch64::STRQpre:
4051   case AArch64::LDPQi:
4052   case AArch64::LDRQpre:
4053   case AArch64::STPQi:
4054   case AArch64::STGi:
4055   case AArch64::STZGi:
4056   case AArch64::ST2Gi:
4057   case AArch64::STZ2Gi:
4058   case AArch64::STGPi:
4059     return 16;
4060   }
4061 }
4062 
4063 bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
4064   switch (MI.getOpcode()) {
4065   default:
4066     return false;
4067   case AArch64::LDRWpre:
4068   case AArch64::LDRXpre:
4069   case AArch64::LDRSWpre:
4070   case AArch64::LDRSpre:
4071   case AArch64::LDRDpre:
4072   case AArch64::LDRQpre:
4073     return true;
4074   }
4075 }
4076 
4077 bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
4078   switch (MI.getOpcode()) {
4079   default:
4080     return false;
4081   case AArch64::STRWpre:
4082   case AArch64::STRXpre:
4083   case AArch64::STRSpre:
4084   case AArch64::STRDpre:
4085   case AArch64::STRQpre:
4086     return true;
4087   }
4088 }
4089 
4090 bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
4091   return isPreLd(MI) || isPreSt(MI);
4092 }
4093 
4094 bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
4095   switch (MI.getOpcode()) {
4096   default:
4097     return false;
4098   case AArch64::LDPSi:
4099   case AArch64::LDPSWi:
4100   case AArch64::LDPDi:
4101   case AArch64::LDPQi:
4102   case AArch64::LDPWi:
4103   case AArch64::LDPXi:
4104   case AArch64::STPSi:
4105   case AArch64::STPDi:
4106   case AArch64::STPQi:
4107   case AArch64::STPWi:
4108   case AArch64::STPXi:
4109   case AArch64::STGPi:
4110     return true;
4111   }
4112 }
4113 
4114 const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
4115   unsigned Idx =
4116       AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
4117                                                                             : 1;
4118   return MI.getOperand(Idx);
4119 }
4120 
4121 const MachineOperand &
4122 AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
4123   unsigned Idx =
4124       AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
4125                                                                             : 2;
4126   return MI.getOperand(Idx);
4127 }
4128 
4129 static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
4130                                               Register Reg) {
4131   if (MI.getParent() == nullptr)
4132     return nullptr;
4133   const MachineFunction *MF = MI.getParent()->getParent();
4134   return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
4135 }
4136 
4137 bool AArch64InstrInfo::isHForm(const MachineInstr &MI) {
4138   auto IsHFPR = [&](const MachineOperand &Op) {
4139     if (!Op.isReg())
4140       return false;
4141     auto Reg = Op.getReg();
4142     if (Reg.isPhysical())
4143       return AArch64::FPR16RegClass.contains(Reg);
4144     const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4145     return TRC == &AArch64::FPR16RegClass ||
4146            TRC == &AArch64::FPR16_loRegClass;
4147   };
4148   return llvm::any_of(MI.operands(), IsHFPR);
4149 }
4150 
4151 bool AArch64InstrInfo::isQForm(const MachineInstr &MI) {
4152   auto IsQFPR = [&](const MachineOperand &Op) {
4153     if (!Op.isReg())
4154       return false;
4155     auto Reg = Op.getReg();
4156     if (Reg.isPhysical())
4157       return AArch64::FPR128RegClass.contains(Reg);
4158     const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4159     return TRC == &AArch64::FPR128RegClass ||
4160            TRC == &AArch64::FPR128_loRegClass;
4161   };
4162   return llvm::any_of(MI.operands(), IsQFPR);
4163 }
4164 
4165 bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) {
4166   switch (MI.getOpcode()) {
4167   case AArch64::BRK:
4168   case AArch64::HLT:
4169   case AArch64::PACIASP:
4170   case AArch64::PACIBSP:
4171     // Implicit BTI behavior.
4172     return true;
4173   case AArch64::PAUTH_PROLOGUE:
4174     // PAUTH_PROLOGUE expands to PACI(A|B)SP.
4175     return true;
4176   case AArch64::HINT: {
4177     unsigned Imm = MI.getOperand(0).getImm();
4178     // Explicit BTI instruction.
4179     if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
4180       return true;
4181     // PACI(A|B)SP instructions.
4182     if (Imm == 25 || Imm == 27)
4183       return true;
4184     return false;
4185   }
4186   default:
4187     return false;
4188   }
4189 }
4190 
4191 bool AArch64InstrInfo::isFpOrNEON(Register Reg) {
4192   if (Reg == 0)
4193     return false;
4194   assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
4195   return AArch64::FPR128RegClass.contains(Reg) ||
4196          AArch64::FPR64RegClass.contains(Reg) ||
4197          AArch64::FPR32RegClass.contains(Reg) ||
4198          AArch64::FPR16RegClass.contains(Reg) ||
4199          AArch64::FPR8RegClass.contains(Reg);
4200 }
4201 
4202 bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) {
4203   auto IsFPR = [&](const MachineOperand &Op) {
4204     if (!Op.isReg())
4205       return false;
4206     auto Reg = Op.getReg();
4207     if (Reg.isPhysical())
4208       return isFpOrNEON(Reg);
4209 
4210     const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4211     return TRC == &AArch64::FPR128RegClass ||
4212            TRC == &AArch64::FPR128_loRegClass ||
4213            TRC == &AArch64::FPR64RegClass ||
4214            TRC == &AArch64::FPR64_loRegClass ||
4215            TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
4216            TRC == &AArch64::FPR8RegClass;
4217   };
4218   return llvm::any_of(MI.operands(), IsFPR);
4219 }
4220 
4221 // Scale the unscaled offsets.  Returns false if the unscaled offset can't be
4222 // scaled.
4223 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
4224   int Scale = AArch64InstrInfo::getMemScale(Opc);
4225 
4226   // If the byte-offset isn't a multiple of the stride, we can't scale this
4227   // offset.
4228   if (Offset % Scale != 0)
4229     return false;
4230 
4231   // Convert the byte-offset used by unscaled into an "element" offset used
4232   // by the scaled pair load/store instructions.
4233   Offset /= Scale;
4234   return true;
4235 }
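// For instance (illustrative), with Opc = AArch64::LDURXi (scale 8) a byte
// offset of 16 becomes the element offset 2, while a byte offset of 12 is not
// a multiple of 8 and makes the function return false.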
4236 
4237 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
4238   if (FirstOpc == SecondOpc)
4239     return true;
4240   // We can also pair sign-ext and zero-ext instructions.
4241   switch (FirstOpc) {
4242   default:
4243     return false;
4244   case AArch64::STRSui:
4245   case AArch64::STURSi:
4246     return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
4247   case AArch64::STRDui:
4248   case AArch64::STURDi:
4249     return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
4250   case AArch64::STRQui:
4251   case AArch64::STURQi:
4252     return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
4253   case AArch64::STRWui:
4254   case AArch64::STURWi:
4255     return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
4256   case AArch64::STRXui:
4257   case AArch64::STURXi:
4258     return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
4259   case AArch64::LDRSui:
4260   case AArch64::LDURSi:
4261     return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
4262   case AArch64::LDRDui:
4263   case AArch64::LDURDi:
4264     return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
4265   case AArch64::LDRQui:
4266   case AArch64::LDURQi:
4267     return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
4268   case AArch64::LDRWui:
4269   case AArch64::LDURWi:
4270     return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
4271   case AArch64::LDRSWui:
4272   case AArch64::LDURSWi:
4273     return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
4274   case AArch64::LDRXui:
4275   case AArch64::LDURXi:
4276     return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
4277   }
4278   // These instructions can't be paired based on their opcodes.
4279   return false;
4280 }
4281 
4282 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
4283                             int64_t Offset1, unsigned Opcode1, int FI2,
4284                             int64_t Offset2, unsigned Opcode2) {
4285   // Accesses through fixed stack object frame indices may access a different
4286   // fixed stack slot. Check that the object offsets plus the instruction offsets match.
4287   if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
4288     int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
4289     int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
4290     assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
4291     // Convert to scaled object offsets.
4292     int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
4293     if (ObjectOffset1 % Scale1 != 0)
4294       return false;
4295     ObjectOffset1 /= Scale1;
4296     int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
4297     if (ObjectOffset2 % Scale2 != 0)
4298       return false;
4299     ObjectOffset2 /= Scale2;
4300     ObjectOffset1 += Offset1;
4301     ObjectOffset2 += Offset2;
4302     return ObjectOffset1 + 1 == ObjectOffset2;
4303   }
4304 
4305   return FI1 == FI2;
4306 }
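// Sketch of the fixed-object case above (made-up frame layout): two 8-byte
// accesses with instruction offsets of 0, through fixed objects at byte
// offsets 8 and 16, give scaled object offsets 1 and 2, so 1 + 1 == 2 and the
// accesses may be clustered.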
4307 
4308 /// Detect opportunities for ldp/stp formation.
4309 ///
4310 /// Only called for LdSt for which getMemOperandWithOffset returns true.
4311 bool AArch64InstrInfo::shouldClusterMemOps(
4312     ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
4313     bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
4314     int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
4315     unsigned NumBytes) const {
4316   assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
4317   const MachineOperand &BaseOp1 = *BaseOps1.front();
4318   const MachineOperand &BaseOp2 = *BaseOps2.front();
4319   const MachineInstr &FirstLdSt = *BaseOp1.getParent();
4320   const MachineInstr &SecondLdSt = *BaseOp2.getParent();
4321   if (BaseOp1.getType() != BaseOp2.getType())
4322     return false;
4323 
4324   assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
4325          "Only base registers and frame indices are supported.");
4326 
4327   // Check for both base regs and base FI.
4328   if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
4329     return false;
4330 
4331   // Only cluster up to a single pair.
4332   if (ClusterSize > 2)
4333     return false;
4334 
4335   if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
4336     return false;
4337 
4338   // Can we pair these instructions based on their opcodes?
4339   unsigned FirstOpc = FirstLdSt.getOpcode();
4340   unsigned SecondOpc = SecondLdSt.getOpcode();
4341   if (!canPairLdStOpc(FirstOpc, SecondOpc))
4342     return false;
4343 
4344   // Can't merge volatiles or load/stores that have a hint to avoid pair
4345   // formation, for example.
4346   if (!isCandidateToMergeOrPair(FirstLdSt) ||
4347       !isCandidateToMergeOrPair(SecondLdSt))
4348     return false;
4349 
4350   // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
4351   int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
4352   if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
4353     return false;
4354 
4355   int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
4356   if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
4357     return false;
4358 
4359   // Pairwise instructions have a 7-bit signed offset field.
4360   if (Offset1 > 63 || Offset1 < -64)
4361     return false;
4362 
4363   // The caller should already have ordered First/SecondLdSt by offset.
4364   // Note: this does not hold for non-equal frame index bases.
4365   if (BaseOp1.isFI()) {
4366     assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
4367            "Caller should have ordered offsets.");
4368 
4369     const MachineFrameInfo &MFI =
4370         FirstLdSt.getParent()->getParent()->getFrameInfo();
4371     return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
4372                            BaseOp2.getIndex(), Offset2, SecondOpc);
4373   }
4374 
4375   assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
4376 
4377   return Offset1 + 1 == Offset2;
4378 }
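// Putting it together (illustrative only):
//   ldr x1, [x0]        // scaled offset 0
//   ldr x2, [x0, #8]    // scaled offset 1
// share a base register, pass the opcode and candidate checks, and satisfy
// Offset1 + 1 == Offset2, so the scheduler is told to keep them adjacent,
// which later enables forming `ldp x1, x2, [x0]`.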
4379 
4380 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
4381                                             unsigned Reg, unsigned SubIdx,
4382                                             unsigned State,
4383                                             const TargetRegisterInfo *TRI) {
4384   if (!SubIdx)
4385     return MIB.addReg(Reg, State);
4386 
4387   if (Register::isPhysicalRegister(Reg))
4388     return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
4389   return MIB.addReg(Reg, State, SubIdx);
4390 }
4391 
4392 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
4393                                         unsigned NumRegs) {
4394   // We really want the positive remainder mod 32 here; that happens to be
4395   // easily obtainable with a mask.
4396   return ((DestReg - SrcReg) & 0x1f) < NumRegs;
4397 }
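// Example (illustrative): if the destination tuple starts one register above
// the source, (DestReg - SrcReg) & 0x1f == 1 is less than NumRegs == 2, so a
// forward copy would overwrite the second source register before it is read,
// and copyPhysRegTuple below copies the sub-registers in reverse order.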
4398 
4399 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
4400                                         MachineBasicBlock::iterator I,
4401                                         const DebugLoc &DL, MCRegister DestReg,
4402                                         MCRegister SrcReg, bool KillSrc,
4403                                         unsigned Opcode,
4404                                         ArrayRef<unsigned> Indices) const {
4405   assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
4406   const TargetRegisterInfo *TRI = &getRegisterInfo();
4407   uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
4408   uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
4409   unsigned NumRegs = Indices.size();
4410 
4411   int SubReg = 0, End = NumRegs, Incr = 1;
4412   if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
4413     SubReg = NumRegs - 1;
4414     End = -1;
4415     Incr = -1;
4416   }
4417 
4418   for (; SubReg != End; SubReg += Incr) {
4419     const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
4420     AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
4421     AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
4422     AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
4423   }
4424 }
4425 
4426 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
4427                                        MachineBasicBlock::iterator I,
4428                                        DebugLoc DL, unsigned DestReg,
4429                                        unsigned SrcReg, bool KillSrc,
4430                                        unsigned Opcode, unsigned ZeroReg,
4431                                        llvm::ArrayRef<unsigned> Indices) const {
4432   const TargetRegisterInfo *TRI = &getRegisterInfo();
4433   unsigned NumRegs = Indices.size();
4434 
4435 #ifndef NDEBUG
4436   uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
4437   uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
4438   assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
4439          "GPR reg sequences should not be able to overlap");
4440 #endif
4441 
4442   for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
4443     const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
4444     AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
4445     MIB.addReg(ZeroReg);
4446     AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
4447     MIB.addImm(0);
4448   }
4449 }
4450 
4451 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
4452                                    MachineBasicBlock::iterator I,
4453                                    const DebugLoc &DL, MCRegister DestReg,
4454                                    MCRegister SrcReg, bool KillSrc) const {
4455   if (AArch64::GPR32spRegClass.contains(DestReg) &&
4456       (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
4457     const TargetRegisterInfo *TRI = &getRegisterInfo();
4458 
4459     if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
4460       // If either operand is WSP, expand to ADD #0.
4461       if (Subtarget.hasZeroCycleRegMove()) {
4462         // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
4463         MCRegister DestRegX = TRI->getMatchingSuperReg(
4464             DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4465         MCRegister SrcRegX = TRI->getMatchingSuperReg(
4466             SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4467         // This instruction is reading and writing X registers.  This may upset
4468         // the register scavenger and machine verifier, so we need to indicate
4469         // that we are reading an undefined value from SrcRegX, but a proper
4470         // value from SrcReg.
4471         BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
4472             .addReg(SrcRegX, RegState::Undef)
4473             .addImm(0)
4474             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
4475             .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
4476       } else {
4477         BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
4478             .addReg(SrcReg, getKillRegState(KillSrc))
4479             .addImm(0)
4480             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4481       }
4482     } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
4483       BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
4484           .addImm(0)
4485           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4486     } else {
4487       if (Subtarget.hasZeroCycleRegMove()) {
4488         // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
4489         MCRegister DestRegX = TRI->getMatchingSuperReg(
4490             DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4491         MCRegister SrcRegX = TRI->getMatchingSuperReg(
4492             SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4493         // This instruction is reading and writing X registers.  This may upset
4494         // the register scavenger and machine verifier, so we need to indicate
4495         // that we are reading an undefined value from SrcRegX, but a proper
4496         // value from SrcReg.
4497         BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
4498             .addReg(AArch64::XZR)
4499             .addReg(SrcRegX, RegState::Undef)
4500             .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
4501       } else {
4502         // Otherwise, expand to ORR WZR.
4503         BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
4504             .addReg(AArch64::WZR)
4505             .addReg(SrcReg, getKillRegState(KillSrc));
4506       }
4507     }
4508     return;
4509   }
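  // Illustrative note: on a zero-cycle-register-move target, copying W1 into
  // W0 through the X-register form above produces machine IR along the lines
  // of
  //   $x0 = ORRXrr $xzr, undef $x1, implicit $w1
  // i.e. the X-register source is marked undef while the real dependency is
  // carried by the implicit W-register operand.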
4510 
4511   // Copy a Predicate register by ORRing with itself.
4512   if (AArch64::PPRRegClass.contains(DestReg) &&
4513       AArch64::PPRRegClass.contains(SrcReg)) {
4514     assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4515            "Unexpected SVE register.");
4516     BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
4517       .addReg(SrcReg) // Pg
4518       .addReg(SrcReg)
4519       .addReg(SrcReg, getKillRegState(KillSrc));
4520     return;
4521   }
4522 
4523   // Copy a predicate-as-counter register by ORRing with itself as if it
4524   // were a regular predicate (mask) register.
4525   bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
4526   bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
4527   if (DestIsPNR || SrcIsPNR) {
4528     auto ToPPR = [](MCRegister R) -> MCRegister {
4529       return (R - AArch64::PN0) + AArch64::P0;
4530     };
4531     MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg;
4532     MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg;
4533 
4534     if (PPRSrcReg != PPRDestReg) {
4535       auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)
4536                        .addReg(PPRSrcReg) // Pg
4537                        .addReg(PPRSrcReg)
4538                        .addReg(PPRSrcReg, getKillRegState(KillSrc));
4539       if (DestIsPNR)
4540         NewMI.addDef(DestReg, RegState::Implicit);
4541     }
4542     return;
4543   }
4544 
4545   // Copy a Z register by ORRing with itself.
4546   if (AArch64::ZPRRegClass.contains(DestReg) &&
4547       AArch64::ZPRRegClass.contains(SrcReg)) {
4548     assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4549            "Unexpected SVE register.");
4550     BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
4551       .addReg(SrcReg)
4552       .addReg(SrcReg, getKillRegState(KillSrc));
4553     return;
4554   }
4555 
4556   // Copy a Z register pair by copying the individual sub-registers.
4557   if ((AArch64::ZPR2RegClass.contains(DestReg) ||
4558        AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
4559       (AArch64::ZPR2RegClass.contains(SrcReg) ||
4560        AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
4561     assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4562            "Unexpected SVE register.");
4563     static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
4564     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
4565                      Indices);
4566     return;
4567   }
4568 
4569   // Copy a Z register triple by copying the individual sub-registers.
4570   if (AArch64::ZPR3RegClass.contains(DestReg) &&
4571       AArch64::ZPR3RegClass.contains(SrcReg)) {
4572     assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4573            "Unexpected SVE register.");
4574     static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
4575                                        AArch64::zsub2};
4576     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
4577                      Indices);
4578     return;
4579   }
4580 
4581   // Copy a Z register quad by copying the individual sub-registers.
4582   if ((AArch64::ZPR4RegClass.contains(DestReg) ||
4583        AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
4584       (AArch64::ZPR4RegClass.contains(SrcReg) ||
4585        AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
4586     assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4587            "Unexpected SVE register.");
4588     static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
4589                                        AArch64::zsub2, AArch64::zsub3};
4590     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
4591                      Indices);
4592     return;
4593   }
4594 
4595   if (AArch64::GPR64spRegClass.contains(DestReg) &&
4596       (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
4597     if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
4598       // If either operand is SP, expand to ADD #0.
4599       BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
4600           .addReg(SrcReg, getKillRegState(KillSrc))
4601           .addImm(0)
4602           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4603     } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
4604       BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
4605           .addImm(0)
4606           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4607     } else {
4608       // Otherwise, expand to ORR XZR.
4609       BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
4610           .addReg(AArch64::XZR)
4611           .addReg(SrcReg, getKillRegState(KillSrc));
4612     }
4613     return;
4614   }
4615 
4616   // Copy a DDDD register quad by copying the individual sub-registers.
4617   if (AArch64::DDDDRegClass.contains(DestReg) &&
4618       AArch64::DDDDRegClass.contains(SrcReg)) {
4619     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
4620                                        AArch64::dsub2, AArch64::dsub3};
4621     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
4622                      Indices);
4623     return;
4624   }
4625 
4626   // Copy a DDD register triple by copying the individual sub-registers.
4627   if (AArch64::DDDRegClass.contains(DestReg) &&
4628       AArch64::DDDRegClass.contains(SrcReg)) {
4629     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
4630                                        AArch64::dsub2};
4631     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
4632                      Indices);
4633     return;
4634   }
4635 
4636   // Copy a DD register pair by copying the individual sub-registers.
4637   if (AArch64::DDRegClass.contains(DestReg) &&
4638       AArch64::DDRegClass.contains(SrcReg)) {
4639     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
4640     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
4641                      Indices);
4642     return;
4643   }
4644 
4645   // Copy a QQQQ register quad by copying the individual sub-registers.
4646   if (AArch64::QQQQRegClass.contains(DestReg) &&
4647       AArch64::QQQQRegClass.contains(SrcReg)) {
4648     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
4649                                        AArch64::qsub2, AArch64::qsub3};
4650     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
4651                      Indices);
4652     return;
4653   }
4654 
4655   // Copy a QQQ register triple by copying the individual sub-registers.
4656   if (AArch64::QQQRegClass.contains(DestReg) &&
4657       AArch64::QQQRegClass.contains(SrcReg)) {
4658     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
4659                                        AArch64::qsub2};
4660     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
4661                      Indices);
4662     return;
4663   }
4664 
4665   // Copy a QQ register pair by copying the individual sub-registers.
4666   if (AArch64::QQRegClass.contains(DestReg) &&
4667       AArch64::QQRegClass.contains(SrcReg)) {
4668     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
4669     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
4670                      Indices);
4671     return;
4672   }
4673 
4674   if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
4675       AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
4676     static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
4677     copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
4678                     AArch64::XZR, Indices);
4679     return;
4680   }
4681 
4682   if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
4683       AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
4684     static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
4685     copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
4686                     AArch64::WZR, Indices);
4687     return;
4688   }
4689 
4690   if (AArch64::FPR128RegClass.contains(DestReg) &&
4691       AArch64::FPR128RegClass.contains(SrcReg)) {
4692     if (Subtarget.isSVEorStreamingSVEAvailable() &&
4693         !Subtarget.isNeonAvailable())
4694       BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
4695           .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
4696           .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
4697           .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
4698     else if (Subtarget.isNeonAvailable())
4699       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
4700           .addReg(SrcReg)
4701           .addReg(SrcReg, getKillRegState(KillSrc));
4702     else {
4703       BuildMI(MBB, I, DL, get(AArch64::STRQpre))
4704           .addReg(AArch64::SP, RegState::Define)
4705           .addReg(SrcReg, getKillRegState(KillSrc))
4706           .addReg(AArch64::SP)
4707           .addImm(-16);
4708       BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
4709           .addReg(AArch64::SP, RegState::Define)
4710           .addReg(DestReg, RegState::Define)
4711           .addReg(AArch64::SP)
4712           .addImm(16);
4713     }
4714     return;
4715   }
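  // Illustrative examples for the FPR128 case above: with NEON available the
  // copy is "orr v0.16b, v1.16b, v1.16b"; in streaming SVE mode without NEON
  // the same copy is done on the overlapping Z registers as
  // "orr z0.d, z1.d, z1.d"; and if neither is usable, the value is bounced
  // through the stack with "str q1, [sp, #-16]!" followed by
  // "ldr q0, [sp], #16".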
4716 
4717   if (AArch64::FPR64RegClass.contains(DestReg) &&
4718       AArch64::FPR64RegClass.contains(SrcReg)) {
4719     BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
4720         .addReg(SrcReg, getKillRegState(KillSrc));
4721     return;
4722   }
4723 
4724   if (AArch64::FPR32RegClass.contains(DestReg) &&
4725       AArch64::FPR32RegClass.contains(SrcReg)) {
4726     BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
4727         .addReg(SrcReg, getKillRegState(KillSrc));
4728     return;
4729   }
4730 
4731   if (AArch64::FPR16RegClass.contains(DestReg) &&
4732       AArch64::FPR16RegClass.contains(SrcReg)) {
4733     DestReg =
4734         RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
4735     SrcReg =
4736         RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
4737     BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
4738         .addReg(SrcReg, getKillRegState(KillSrc));
4739     return;
4740   }
4741 
4742   if (AArch64::FPR8RegClass.contains(DestReg) &&
4743       AArch64::FPR8RegClass.contains(SrcReg)) {
4744     DestReg =
4745         RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
4746     SrcReg =
4747         RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
4748     BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
4749         .addReg(SrcReg, getKillRegState(KillSrc));
4750     return;
4751   }
4752 
4753   // Copies between GPR64 and FPR64.
4754   if (AArch64::FPR64RegClass.contains(DestReg) &&
4755       AArch64::GPR64RegClass.contains(SrcReg)) {
4756     BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
4757         .addReg(SrcReg, getKillRegState(KillSrc));
4758     return;
4759   }
4760   if (AArch64::GPR64RegClass.contains(DestReg) &&
4761       AArch64::FPR64RegClass.contains(SrcReg)) {
4762     BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
4763         .addReg(SrcReg, getKillRegState(KillSrc));
4764     return;
4765   }
4766   // Copies between GPR32 and FPR32.
4767   if (AArch64::FPR32RegClass.contains(DestReg) &&
4768       AArch64::GPR32RegClass.contains(SrcReg)) {
4769     BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
4770         .addReg(SrcReg, getKillRegState(KillSrc));
4771     return;
4772   }
4773   if (AArch64::GPR32RegClass.contains(DestReg) &&
4774       AArch64::FPR32RegClass.contains(SrcReg)) {
4775     BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
4776         .addReg(SrcReg, getKillRegState(KillSrc));
4777     return;
4778   }
4779 
4780   if (DestReg == AArch64::NZCV) {
4781     assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
4782     BuildMI(MBB, I, DL, get(AArch64::MSR))
4783         .addImm(AArch64SysReg::NZCV)
4784         .addReg(SrcReg, getKillRegState(KillSrc))
4785         .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
4786     return;
4787   }
4788 
4789   if (SrcReg == AArch64::NZCV) {
4790     assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
4791     BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
4792         .addImm(AArch64SysReg::NZCV)
4793         .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
4794     return;
4795   }
4796 
4797 #ifndef NDEBUG
4798   const TargetRegisterInfo &TRI = getRegisterInfo();
4799   errs() << TRI.getRegAsmName(DestReg) << " = COPY "
4800          << TRI.getRegAsmName(SrcReg) << "\n";
4801 #endif
4802   llvm_unreachable("unimplemented reg-to-reg copy");
4803 }
4804 
4805 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
4806                                     MachineBasicBlock &MBB,
4807                                     MachineBasicBlock::iterator InsertBefore,
4808                                     const MCInstrDesc &MCID,
4809                                     Register SrcReg, bool IsKill,
4810                                     unsigned SubIdx0, unsigned SubIdx1, int FI,
4811                                     MachineMemOperand *MMO) {
4812   Register SrcReg0 = SrcReg;
4813   Register SrcReg1 = SrcReg;
4814   if (SrcReg.isPhysical()) {
4815     SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
4816     SubIdx0 = 0;
4817     SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
4818     SubIdx1 = 0;
4819   }
4820   BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
4821       .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
4822       .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
4823       .addFrameIndex(FI)
4824       .addImm(0)
4825       .addMemOperand(MMO);
4826 }
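// Illustrative example (frame index chosen for illustration): spilling the
// physical sequential pair $x0_x1 with MCID == STPXi expands to machine IR
// along the lines of
//   STPXi killed $x0, killed $x1, %stack.0, 0
// For a virtual register, the two operands instead reference the tuple
// register with SubIdx0/SubIdx1 sub-register indices.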
4827 
4828 void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
4829                                            MachineBasicBlock::iterator MBBI,
4830                                            Register SrcReg, bool isKill, int FI,
4831                                            const TargetRegisterClass *RC,
4832                                            const TargetRegisterInfo *TRI,
4833                                            Register VReg) const {
4834   MachineFunction &MF = *MBB.getParent();
4835   MachineFrameInfo &MFI = MF.getFrameInfo();
4836 
4837   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
4838   MachineMemOperand *MMO =
4839       MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
4840                               MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
4841   unsigned Opc = 0;
4842   bool Offset = true;
4843   MCRegister PNRReg = MCRegister::NoRegister;
4844   unsigned StackID = TargetStackID::Default;
4845   switch (TRI->getSpillSize(*RC)) {
4846   case 1:
4847     if (AArch64::FPR8RegClass.hasSubClassEq(RC))
4848       Opc = AArch64::STRBui;
4849     break;
4850   case 2: {
4851     if (AArch64::FPR16RegClass.hasSubClassEq(RC))
4852       Opc = AArch64::STRHui;
4853     else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
4854              AArch64::PPRRegClass.hasSubClassEq(RC)) {
4855       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4856              "Unexpected register store without SVE store instructions");
4857       Opc = AArch64::STR_PXI;
4858       StackID = TargetStackID::ScalableVector;
4859     }
4860     break;
4861   }
4862   case 4:
4863     if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
4864       Opc = AArch64::STRWui;
4865       if (SrcReg.isVirtual())
4866         MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
4867       else
4868         assert(SrcReg != AArch64::WSP);
4869     } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
4870       Opc = AArch64::STRSui;
4871     else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
4872       Opc = AArch64::STR_PPXI;
4873       StackID = TargetStackID::ScalableVector;
4874     }
4875     break;
4876   case 8:
4877     if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
4878       Opc = AArch64::STRXui;
4879       if (SrcReg.isVirtual())
4880         MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
4881       else
4882         assert(SrcReg != AArch64::SP);
4883     } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
4884       Opc = AArch64::STRDui;
4885     } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
4886       storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
4887                               get(AArch64::STPWi), SrcReg, isKill,
4888                               AArch64::sube32, AArch64::subo32, FI, MMO);
4889       return;
4890     }
4891     break;
4892   case 16:
4893     if (AArch64::FPR128RegClass.hasSubClassEq(RC))
4894       Opc = AArch64::STRQui;
4895     else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
4896       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4897       Opc = AArch64::ST1Twov1d;
4898       Offset = false;
4899     } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
4900       storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
4901                               get(AArch64::STPXi), SrcReg, isKill,
4902                               AArch64::sube64, AArch64::subo64, FI, MMO);
4903       return;
4904     } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
4905       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4906              "Unexpected register store without SVE store instructions");
4907       Opc = AArch64::STR_ZXI;
4908       StackID = TargetStackID::ScalableVector;
4909     }
4910     break;
4911   case 24:
4912     if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
4913       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4914       Opc = AArch64::ST1Threev1d;
4915       Offset = false;
4916     }
4917     break;
4918   case 32:
4919     if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
4920       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4921       Opc = AArch64::ST1Fourv1d;
4922       Offset = false;
4923     } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
4924       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4925       Opc = AArch64::ST1Twov2d;
4926       Offset = false;
4927     } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
4928                AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
4929       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4930              "Unexpected register store without SVE store instructions");
4931       Opc = AArch64::STR_ZZXI;
4932       StackID = TargetStackID::ScalableVector;
4933     }
4934     break;
4935   case 48:
4936     if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
4937       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4938       Opc = AArch64::ST1Threev2d;
4939       Offset = false;
4940     } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
4941       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4942              "Unexpected register store without SVE store instructions");
4943       Opc = AArch64::STR_ZZZXI;
4944       StackID = TargetStackID::ScalableVector;
4945     }
4946     break;
4947   case 64:
4948     if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
4949       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4950       Opc = AArch64::ST1Fourv2d;
4951       Offset = false;
4952     } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
4953                AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
4954       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4955              "Unexpected register store without SVE store instructions");
4956       Opc = AArch64::STR_ZZZZXI;
4957       StackID = TargetStackID::ScalableVector;
4958     }
4959     break;
4960   }
4961   assert(Opc && "Unknown register class");
4962   MFI.setStackID(FI, StackID);
4963 
4964   const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
4965                                      .addReg(SrcReg, getKillRegState(isKill))
4966                                      .addFrameIndex(FI);
4967 
4968   if (Offset)
4969     MI.addImm(0);
4970   if (PNRReg.isValid())
4971     MI.addDef(PNRReg, RegState::Implicit);
4972   MI.addMemOperand(MMO);
4973 }
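// Illustrative examples of the opcode selection above (MIR shown loosely):
// spilling a GPR64 produces "STRXui $x?, %stack.N, 0"; spilling a single SVE
// Z register produces "STR_ZXI" and retags the slot as a scalable-vector
// stack object; spilling a NEON QQ pair selects ST1Twov2d, which takes no
// immediate offset, hence Offset is cleared above.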
4974 
4975 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
4976                                      MachineBasicBlock &MBB,
4977                                      MachineBasicBlock::iterator InsertBefore,
4978                                      const MCInstrDesc &MCID,
4979                                      Register DestReg, unsigned SubIdx0,
4980                                      unsigned SubIdx1, int FI,
4981                                      MachineMemOperand *MMO) {
4982   Register DestReg0 = DestReg;
4983   Register DestReg1 = DestReg;
4984   bool IsUndef = true;
4985   if (DestReg.isPhysical()) {
4986     DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
4987     SubIdx0 = 0;
4988     DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
4989     SubIdx1 = 0;
4990     IsUndef = false;
4991   }
4992   BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
4993       .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
4994       .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
4995       .addFrameIndex(FI)
4996       .addImm(0)
4997       .addMemOperand(MMO);
4998 }
4999 
5000 void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
5001                                             MachineBasicBlock::iterator MBBI,
5002                                             Register DestReg, int FI,
5003                                             const TargetRegisterClass *RC,
5004                                             const TargetRegisterInfo *TRI,
5005                                             Register VReg) const {
5006   MachineFunction &MF = *MBB.getParent();
5007   MachineFrameInfo &MFI = MF.getFrameInfo();
5008   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
5009   MachineMemOperand *MMO =
5010       MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
5011                               MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
5012 
5013   unsigned Opc = 0;
5014   bool Offset = true;
5015   unsigned StackID = TargetStackID::Default;
5016   Register PNRReg = MCRegister::NoRegister;
5017   switch (TRI->getSpillSize(*RC)) {
5018   case 1:
5019     if (AArch64::FPR8RegClass.hasSubClassEq(RC))
5020       Opc = AArch64::LDRBui;
5021     break;
5022   case 2: {
5023     bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
5024     if (AArch64::FPR16RegClass.hasSubClassEq(RC))
5025       Opc = AArch64::LDRHui;
5026     else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
5027       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5028              "Unexpected register load without SVE load instructions");
5029       if (IsPNR)
5030         PNRReg = DestReg;
5031       Opc = AArch64::LDR_PXI;
5032       StackID = TargetStackID::ScalableVector;
5033     }
5034     break;
5035   }
5036   case 4:
5037     if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
5038       Opc = AArch64::LDRWui;
5039       if (DestReg.isVirtual())
5040         MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
5041       else
5042         assert(DestReg != AArch64::WSP);
5043     } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
5044       Opc = AArch64::LDRSui;
5045     else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
5046       Opc = AArch64::LDR_PPXI;
5047       StackID = TargetStackID::ScalableVector;
5048     }
5049     break;
5050   case 8:
5051     if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
5052       Opc = AArch64::LDRXui;
5053       if (DestReg.isVirtual())
5054         MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
5055       else
5056         assert(DestReg != AArch64::SP);
5057     } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
5058       Opc = AArch64::LDRDui;
5059     } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
5060       loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
5061                                get(AArch64::LDPWi), DestReg, AArch64::sube32,
5062                                AArch64::subo32, FI, MMO);
5063       return;
5064     }
5065     break;
5066   case 16:
5067     if (AArch64::FPR128RegClass.hasSubClassEq(RC))
5068       Opc = AArch64::LDRQui;
5069     else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
5070       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5071       Opc = AArch64::LD1Twov1d;
5072       Offset = false;
5073     } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
5074       loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
5075                                get(AArch64::LDPXi), DestReg, AArch64::sube64,
5076                                AArch64::subo64, FI, MMO);
5077       return;
5078     } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
5079       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5080              "Unexpected register load without SVE load instructions");
5081       Opc = AArch64::LDR_ZXI;
5082       StackID = TargetStackID::ScalableVector;
5083     }
5084     break;
5085   case 24:
5086     if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
5087       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5088       Opc = AArch64::LD1Threev1d;
5089       Offset = false;
5090     }
5091     break;
5092   case 32:
5093     if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
5094       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5095       Opc = AArch64::LD1Fourv1d;
5096       Offset = false;
5097     } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
5098       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5099       Opc = AArch64::LD1Twov2d;
5100       Offset = false;
5101     } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
5102                AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5103       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5104              "Unexpected register load without SVE load instructions");
5105       Opc = AArch64::LDR_ZZXI;
5106       StackID = TargetStackID::ScalableVector;
5107     }
5108     break;
5109   case 48:
5110     if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
5111       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5112       Opc = AArch64::LD1Threev2d;
5113       Offset = false;
5114     } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
5115       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5116              "Unexpected register load without SVE load instructions");
5117       Opc = AArch64::LDR_ZZZXI;
5118       StackID = TargetStackID::ScalableVector;
5119     }
5120     break;
5121   case 64:
5122     if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
5123       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5124       Opc = AArch64::LD1Fourv2d;
5125       Offset = false;
5126     } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
5127                AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5128       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5129              "Unexpected register load without SVE load instructions");
5130       Opc = AArch64::LDR_ZZZZXI;
5131       StackID = TargetStackID::ScalableVector;
5132     }
5133     break;
5134   }
5135 
5136   assert(Opc && "Unknown register class");
5137   MFI.setStackID(FI, StackID);
5138 
5139   const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
5140                                      .addReg(DestReg, getDefRegState(true))
5141                                      .addFrameIndex(FI);
5142   if (Offset)
5143     MI.addImm(0);
5144   if (PNRReg.isValid() && !PNRReg.isVirtual())
5145     MI.addDef(PNRReg, RegState::Implicit);
5146   MI.addMemOperand(MMO);
5147 }
5148 
5149 bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
5150                                            const MachineInstr &UseMI,
5151                                            const TargetRegisterInfo *TRI) {
5152   return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
5153                                          UseMI.getIterator()),
5154                 [TRI](const MachineInstr &I) {
5155                   return I.modifiesRegister(AArch64::NZCV, TRI) ||
5156                          I.readsRegister(AArch64::NZCV, TRI);
5157                 });
5158 }
5159 
5160 void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
5161     const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
5162   // The smallest scalable elements supported by scaled SVE addressing
5163   // modes are predicates, which are 2 scalable bytes in size. So the scalable
5164   // byte offset must always be a multiple of 2.
5165   assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
5166 
5167   // VGSized offsets are divided by '2', because the VG register is the
5168   // number of 64bit granules as opposed to 128bit vector chunks,
5169   // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
5170   // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
5171   // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
5172   ByteSized = Offset.getFixed();
5173   VGSized = Offset.getScalable() / 2;
5174 }
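// Worked example (values chosen for illustration): a StackOffset of 16 fixed
// bytes plus 34 scalable bytes decomposes into ByteSized = 16 and
// VGSized = 17, i.e. the resulting DWARF expression describes
// "16 + 17 * VG", where VG is the number of 64-bit granules in a scalable
// vector.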
5175 
5176 /// Decomposes the given frame offset into the parts needed to describe
5177 /// it as a frame offset adjustment.
5178 /// For non-scalable offsets this is simply the byte size.
5179 void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
5180     const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
5181     int64_t &NumDataVectors) {
5182   // The smallest scalable elements supported by scaled SVE addressing
5183   // modes are predicates, which are 2 scalable bytes in size. So the scalable
5184   // byte offset must always be a multiple of 2.
5185   assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
5186 
5187   NumBytes = Offset.getFixed();
5188   NumDataVectors = 0;
5189   NumPredicateVectors = Offset.getScalable() / 2;
5190   // This method computes the parts used to adjust the frame offset.
5191   // If more than two ADDPL instructions would be needed, or the predicate
5192   // offset is an exact multiple of a full data vector, part of the offset
5193   // is folded into NumDataVectors so that ADDVL covers it with fewer ADDPLs.
5194   if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
5195       NumPredicateVectors > 62) {
5196     NumDataVectors = NumPredicateVectors / 8;
5197     NumPredicateVectors -= NumDataVectors * 8;
5198   }
5199 }
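// Worked example (values chosen for illustration): a scalable offset of 132
// bytes corresponds to 66 predicate-sized (2-byte) slots; since 66 exceeds
// the +62 limit above, 8 data vectors (64 predicate slots) are split off,
// leaving NumDataVectors = 8 and NumPredicateVectors = 2, so the adjustment
// can be done with one ADDVL plus one ADDPL instead of several ADDPLs.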
5200 
5201 // Convenience function to create a DWARF expression for
5202 //   Expr + NumBytes + NumVGScaledBytes * AArch64::VG
5203 static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes,
5204                                      int NumVGScaledBytes, unsigned VG,
5205                                      llvm::raw_string_ostream &Comment) {
5206   uint8_t buffer[16];
5207 
5208   if (NumBytes) {
5209     Expr.push_back(dwarf::DW_OP_consts);
5210     Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
5211     Expr.push_back((uint8_t)dwarf::DW_OP_plus);
5212     Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
5213   }
5214 
5215   if (NumVGScaledBytes) {
5216     Expr.push_back((uint8_t)dwarf::DW_OP_consts);
5217     Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));
5218 
5219     Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
5220     Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
5221     Expr.push_back(0);
5222 
5223     Expr.push_back((uint8_t)dwarf::DW_OP_mul);
5224     Expr.push_back((uint8_t)dwarf::DW_OP_plus);
5225 
5226     Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
5227             << std::abs(NumVGScaledBytes) << " * VG";
5228   }
5229 }
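// Illustrative encoding (operands shown symbolically): for NumBytes == 16 and
// NumVGScaledBytes == 8 the function appends
//   DW_OP_consts 16, DW_OP_plus,
//   DW_OP_consts 8, DW_OP_bregx VG 0, DW_OP_mul, DW_OP_plus
// and the comment stream receives " + 16 + 8 * VG".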
5230 
5231 // Creates an MCCFIInstruction:
5232 //    { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
5233 static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
5234                                                unsigned Reg,
5235                                                const StackOffset &Offset) {
5236   int64_t NumBytes, NumVGScaledBytes;
5237   AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
5238                                                         NumVGScaledBytes);
5239   std::string CommentBuffer;
5240   llvm::raw_string_ostream Comment(CommentBuffer);
5241 
5242   if (Reg == AArch64::SP)
5243     Comment << "sp";
5244   else if (Reg == AArch64::FP)
5245     Comment << "fp";
5246   else
5247     Comment << printReg(Reg, &TRI);
5248 
5249   // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG)
5250   SmallString<64> Expr;
5251   unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5252   Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg));
5253   Expr.push_back(0);
5254   appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
5255                            TRI.getDwarfRegNum(AArch64::VG, true), Comment);
5256 
5257   // Wrap this into DW_CFA_def_cfa.
5258   SmallString<64> DefCfaExpr;
5259   DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
5260   uint8_t buffer[16];
5261   DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer));
5262   DefCfaExpr.append(Expr.str());
5263   return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(),
5264                                         Comment.str());
5265 }
5266 
5267 MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI,
5268                                     unsigned FrameReg, unsigned Reg,
5269                                     const StackOffset &Offset,
5270                                     bool LastAdjustmentWasScalable) {
5271   if (Offset.getScalable())
5272     return createDefCFAExpression(TRI, Reg, Offset);
5273 
5274   if (FrameReg == Reg && !LastAdjustmentWasScalable)
5275     return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));
5276 
5277   unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5278   return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
5279 }
5280 
5281 MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI,
5282                                        unsigned Reg,
5283                                        const StackOffset &OffsetFromDefCFA) {
5284   int64_t NumBytes, NumVGScaledBytes;
5285   AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
5286       OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
5287 
5288   unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5289 
5290   // Non-scalable offsets can use DW_CFA_offset directly.
5291   if (!NumVGScaledBytes)
5292     return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
5293 
5294   std::string CommentBuffer;
5295   llvm::raw_string_ostream Comment(CommentBuffer);
5296   Comment << printReg(Reg, &TRI) << "  @ cfa";
5297 
5298   // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
5299   SmallString<64> OffsetExpr;
5300   appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
5301                            TRI.getDwarfRegNum(AArch64::VG, true), Comment);
5302 
5303   // Wrap this into DW_CFA_expression
5304   SmallString<64> CfaExpr;
5305   CfaExpr.push_back(dwarf::DW_CFA_expression);
5306   uint8_t buffer[16];
5307   CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
5308   CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
5309   CfaExpr.append(OffsetExpr.str());
5310 
5311   return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(),
5312                                         Comment.str());
5313 }
5314 
5315 // Helper function to emit a frame offset adjustment from a given
5316 // pointer (SrcReg), writing the result to DestReg. The function is
5317 // explicit in that the caller must supply the add/sub opcode to use.
5318 static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
5319                                MachineBasicBlock::iterator MBBI,
5320                                const DebugLoc &DL, unsigned DestReg,
5321                                unsigned SrcReg, int64_t Offset, unsigned Opc,
5322                                const TargetInstrInfo *TII,
5323                                MachineInstr::MIFlag Flag, bool NeedsWinCFI,
5324                                bool *HasWinCFI, bool EmitCFAOffset,
5325                                StackOffset CFAOffset, unsigned FrameReg) {
5326   int Sign = 1;
5327   unsigned MaxEncoding, ShiftSize;
5328   switch (Opc) {
5329   case AArch64::ADDXri:
5330   case AArch64::ADDSXri:
5331   case AArch64::SUBXri:
5332   case AArch64::SUBSXri:
5333     MaxEncoding = 0xfff;
5334     ShiftSize = 12;
5335     break;
5336   case AArch64::ADDVL_XXI:
5337   case AArch64::ADDPL_XXI:
5338   case AArch64::ADDSVL_XXI:
5339   case AArch64::ADDSPL_XXI:
5340     MaxEncoding = 31;
5341     ShiftSize = 0;
5342     if (Offset < 0) {
5343       MaxEncoding = 32;
5344       Sign = -1;
5345       Offset = -Offset;
5346     }
5347     break;
5348   default:
5349     llvm_unreachable("Unsupported opcode");
5350   }
5351 
5352   // `Offset` can be in bytes or in "scalable bytes".
5353   int VScale = 1;
5354   if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
5355     VScale = 16;
5356   else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
5357     VScale = 2;
5358 
5359   // FIXME: If the offset won't fit in 24-bits, compute the offset into a
5360   // scratch register.  If DestReg is a virtual register, use it as the
5361   // scratch register; otherwise, create a new virtual register (to be
5362   // replaced by the scavenger at the end of PEI).  That case can be optimized
5363   // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
5364   // register can be loaded with offset%8 and the add/sub can use an extending
5365   // instruction with LSL#3.
5366   // Currently the function handles any offsets but generates a poor sequence
5367   // of code.
5368   //  assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
5369 
5370   const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
5371   Register TmpReg = DestReg;
5372   if (TmpReg == AArch64::XZR)
5373     TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
5374         &AArch64::GPR64RegClass);
5375   do {
5376     uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
5377     unsigned LocalShiftSize = 0;
5378     if (ThisVal > MaxEncoding) {
5379       ThisVal = ThisVal >> ShiftSize;
5380       LocalShiftSize = ShiftSize;
5381     }
5382     assert((ThisVal >> ShiftSize) <= MaxEncoding &&
5383            "Encoding cannot handle value that big");
5384 
5385     Offset -= ThisVal << LocalShiftSize;
5386     if (Offset == 0)
5387       TmpReg = DestReg;
5388     auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
5389                    .addReg(SrcReg)
5390                    .addImm(Sign * (int)ThisVal);
5391     if (ShiftSize)
5392       MBI = MBI.addImm(
5393           AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
5394     MBI = MBI.setMIFlag(Flag);
5395 
5396     auto Change =
5397         VScale == 1
5398             ? StackOffset::getFixed(ThisVal << LocalShiftSize)
5399             : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
5400     if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
5401       CFAOffset += Change;
5402     else
5403       CFAOffset -= Change;
5404     if (EmitCFAOffset && DestReg == TmpReg) {
5405       MachineFunction &MF = *MBB.getParent();
5406       const TargetSubtargetInfo &STI = MF.getSubtarget();
5407       const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
5408 
5409       unsigned CFIIndex = MF.addFrameInst(
5410           createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
5411       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
5412           .addCFIIndex(CFIIndex)
5413           .setMIFlags(Flag);
5414     }
5415 
5416     if (NeedsWinCFI) {
5417       assert(Sign == 1 && "SEH directives should always have a positive sign");
5418       int Imm = (int)(ThisVal << LocalShiftSize);
5419       if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
5420           (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
5421         if (HasWinCFI)
5422           *HasWinCFI = true;
5423         if (Imm == 0)
5424           BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
5425         else
5426           BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
5427               .addImm(Imm)
5428               .setMIFlag(Flag);
5429         assert(Offset == 0 && "Expected remaining offset to be zero to "
5430                               "emit a single SEH directive");
5431       } else if (DestReg == AArch64::SP) {
5432         if (HasWinCFI)
5433           *HasWinCFI = true;
5434         assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
5435         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
5436             .addImm(Imm)
5437             .setMIFlag(Flag);
5438       }
5439     }
5440 
5441     SrcReg = TmpReg;
5442   } while (Offset);
5443 }
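// Worked example (values chosen for illustration): for Opc == ADDXri and
// Offset == 0x12345, the loop above emits two instructions, reusing DestReg
// as the intermediate:
//   add dst, src, #0x12, lsl #12   // covers 0x12000
//   add dst, dst, #0x345           // remaining 0x345
// because only a 12-bit immediate, optionally shifted left by 12, can be
// encoded per ADD/SUB.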
5444 
5445 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
5446                            MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
5447                            unsigned DestReg, unsigned SrcReg,
5448                            StackOffset Offset, const TargetInstrInfo *TII,
5449                            MachineInstr::MIFlag Flag, bool SetNZCV,
5450                            bool NeedsWinCFI, bool *HasWinCFI,
5451                            bool EmitCFAOffset, StackOffset CFAOffset,
5452                            unsigned FrameReg) {
5453   // If a function is marked as arm_locally_streaming, then the runtime value of
5454   // vscale in the prologue/epilogue is different from the runtime value of vscale
5455   // in the function's body. To avoid having to consider multiple vscales,
5456   // we can use `addsvl` to allocate any scalable stack-slots, which under
5457   // most circumstances will be only locals, not callee-save slots.
5458   const Function &F = MBB.getParent()->getFunction();
5459   bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");
5460 
5461   int64_t Bytes, NumPredicateVectors, NumDataVectors;
5462   AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
5463       Offset, Bytes, NumPredicateVectors, NumDataVectors);
5464 
5465   // First emit non-scalable frame offsets, or a simple 'mov'.
5466   if (Bytes || (!Offset && SrcReg != DestReg)) {
5467     assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
5468            "SP increment/decrement not 8-byte aligned");
5469     unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
5470     if (Bytes < 0) {
5471       Bytes = -Bytes;
5472       Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
5473     }
5474     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
5475                        NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
5476                        FrameReg);
5477     CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
5478                      ? StackOffset::getFixed(-Bytes)
5479                      : StackOffset::getFixed(Bytes);
5480     SrcReg = DestReg;
5481     FrameReg = DestReg;
5482   }
5483 
5484   assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
5485          "SetNZCV not supported with SVE vectors");
5486   assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
5487          "WinCFI not supported with SVE vectors");
5488 
5489   if (NumDataVectors) {
5490     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
5491                        UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI,
5492                        TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
5493                        CFAOffset, FrameReg);
5494     CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
5495     SrcReg = DestReg;
5496   }
5497 
5498   if (NumPredicateVectors) {
5499     assert(DestReg != AArch64::SP && "Unaligned access to SP");
5500     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
5501                        UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI,
5502                        TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
5503                        CFAOffset, FrameReg);
5504   }
5505 }
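// Illustrative example (register and offsets chosen for illustration): for
// DestReg == X8, SrcReg == SP and Offset == 16 fixed + 32 scalable bytes, the
// decomposition above yields Bytes = 16 and NumDataVectors = 2, so this emits
//   add x8, sp, #16
//   addvl x8, x8, #2
// with CFAOffset updated across both adjustments.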
5506 
5507 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
5508     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
5509     MachineBasicBlock::iterator InsertPt, int FrameIndex,
5510     LiveIntervals *LIS, VirtRegMap *VRM) const {
5511   // This is a bit of a hack. Consider this instruction:
5512   //
5513   //   %0 = COPY %sp; GPR64all:%0
5514   //
5515   // We explicitly chose GPR64all for the virtual register so such a copy might
5516   // be eliminated by RegisterCoalescer. However, that may not be possible, and
5517   // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
5518   // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
5519   //
5520   // To prevent that, we are going to constrain the %0 register class here.
5521   if (MI.isFullCopy()) {
5522     Register DstReg = MI.getOperand(0).getReg();
5523     Register SrcReg = MI.getOperand(1).getReg();
5524     if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
5525       MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
5526       return nullptr;
5527     }
5528     if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
5529       MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
5530       return nullptr;
5531     }
5532     // Nothing can be folded with a copy from/to NZCV.
5533     if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
5534       return nullptr;
5535   }
5536 
5537   // Handle the case where a copy is being spilled or filled but the source
5538   // and destination register classes don't match.  For example:
5539   //
5540   //   %0 = COPY %xzr; GPR64common:%0
5541   //
5542   // In this case we can still safely fold away the COPY and generate the
5543   // following spill code:
5544   //
5545   //   STRXui %xzr, %stack.0
5546   //
5547   // This also eliminates spilled cross register class COPYs (e.g. between x and
5548   // d regs) of the same size.  For example:
5549   //
5550   //   %0 = COPY %1; GPR64:%0, FPR64:%1
5551   //
5552   // will be filled as
5553   //
5554   //   LDRDui %0, fi<#0>
5555   //
5556   // instead of
5557   //
5558   //   LDRXui %Temp, fi<#0>
5559   //   %0 = FMOV %Temp
5560   //
5561   if (MI.isCopy() && Ops.size() == 1 &&
5562       // Make sure we're only folding the explicit COPY defs/uses.
5563       (Ops[0] == 0 || Ops[0] == 1)) {
5564     bool IsSpill = Ops[0] == 0;
5565     bool IsFill = !IsSpill;
5566     const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
5567     const MachineRegisterInfo &MRI = MF.getRegInfo();
5568     MachineBasicBlock &MBB = *MI.getParent();
5569     const MachineOperand &DstMO = MI.getOperand(0);
5570     const MachineOperand &SrcMO = MI.getOperand(1);
5571     Register DstReg = DstMO.getReg();
5572     Register SrcReg = SrcMO.getReg();
5573     // This is slightly expensive to compute for physical regs since
5574     // getMinimalPhysRegClass is slow.
5575     auto getRegClass = [&](unsigned Reg) {
5576       return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
5577                                               : TRI.getMinimalPhysRegClass(Reg);
5578     };
5579 
5580     if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
5581       assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
5582                  TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
5583              "Mismatched register size in non subreg COPY");
5584       if (IsSpill)
5585         storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
5586                             getRegClass(SrcReg), &TRI, Register());
5587       else
5588         loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
5589                              getRegClass(DstReg), &TRI, Register());
5590       return &*--InsertPt;
5591     }
5592 
5593     // Handle cases like spilling def of:
5594     //
5595     //   %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
5596     //
5597     // where the physical register source can be widened and stored to the full
5598     // virtual reg destination stack slot, in this case producing:
5599     //
5600     //   STRXui %xzr, %stack.0
5601     //
5602     if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
5603         TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) {
5604       assert(SrcMO.getSubReg() == 0 &&
5605              "Unexpected subreg on physical register");
5606       storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
5607                           FrameIndex, &AArch64::GPR64RegClass, &TRI,
5608                           Register());
5609       return &*--InsertPt;
5610     }
5611 
5612     // Handle cases like filling use of:
5613     //
5614     //   %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
5615     //
5616     // where we can load the full virtual reg source stack slot, into the subreg
5617     // destination, in this case producing:
5618     //
5619     //   LDRWui %0:sub_32<def,read-undef>, %stack.0
5620     //
5621     if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
5622       const TargetRegisterClass *FillRC;
5623       switch (DstMO.getSubReg()) {
5624       default:
5625         FillRC = nullptr;
5626         break;
5627       case AArch64::sub_32:
5628         FillRC = &AArch64::GPR32RegClass;
5629         break;
5630       case AArch64::ssub:
5631         FillRC = &AArch64::FPR32RegClass;
5632         break;
5633       case AArch64::dsub:
5634         FillRC = &AArch64::FPR64RegClass;
5635         break;
5636       }
5637 
5638       if (FillRC) {
5639         assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
5640                    TRI.getRegSizeInBits(*FillRC) &&
5641                "Mismatched regclass size on folded subreg COPY");
5642         loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI,
5643                              Register());
5644         MachineInstr &LoadMI = *--InsertPt;
5645         MachineOperand &LoadDst = LoadMI.getOperand(0);
5646         assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
5647         LoadDst.setSubReg(DstMO.getSubReg());
5648         LoadDst.setIsUndef();
5649         return &LoadMI;
5650       }
5651     }
5652   }
5653 
5654   // Cannot fold.
5655   return nullptr;
5656 }
5657 
5658 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
5659                                     StackOffset &SOffset,
5660                                     bool *OutUseUnscaledOp,
5661                                     unsigned *OutUnscaledOp,
5662                                     int64_t *EmittableOffset) {
5663   // Set output values in case of early exit.
5664   if (EmittableOffset)
5665     *EmittableOffset = 0;
5666   if (OutUseUnscaledOp)
5667     *OutUseUnscaledOp = false;
5668   if (OutUnscaledOp)
5669     *OutUnscaledOp = 0;
5670 
5671   // Exit early for structured vector spills/fills as they can't take an
5672   // immediate offset.
5673   switch (MI.getOpcode()) {
5674   default:
5675     break;
5676   case AArch64::LD1Rv1d:
5677   case AArch64::LD1Rv2s:
5678   case AArch64::LD1Rv2d:
5679   case AArch64::LD1Rv4h:
5680   case AArch64::LD1Rv4s:
5681   case AArch64::LD1Rv8b:
5682   case AArch64::LD1Rv8h:
5683   case AArch64::LD1Rv16b:
5684   case AArch64::LD1Twov2d:
5685   case AArch64::LD1Threev2d:
5686   case AArch64::LD1Fourv2d:
5687   case AArch64::LD1Twov1d:
5688   case AArch64::LD1Threev1d:
5689   case AArch64::LD1Fourv1d:
5690   case AArch64::ST1Twov2d:
5691   case AArch64::ST1Threev2d:
5692   case AArch64::ST1Fourv2d:
5693   case AArch64::ST1Twov1d:
5694   case AArch64::ST1Threev1d:
5695   case AArch64::ST1Fourv1d:
5696   case AArch64::ST1i8:
5697   case AArch64::ST1i16:
5698   case AArch64::ST1i32:
5699   case AArch64::ST1i64:
5700   case AArch64::IRG:
5701   case AArch64::IRGstack:
5702   case AArch64::STGloop:
5703   case AArch64::STZGloop:
5704     return AArch64FrameOffsetCannotUpdate;
5705   }
5706 
5707   // Get the min/max offset and the scale.
5708   TypeSize ScaleValue(0U, false), Width(0U, false);
5709   int64_t MinOff, MaxOff;
5710   if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
5711                                       MaxOff))
5712     llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
5713 
5714   // Construct the complete offset.
5715   bool IsMulVL = ScaleValue.isScalable();
5716   unsigned Scale = ScaleValue.getKnownMinValue();
5717   int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
5718 
5719   const MachineOperand &ImmOpnd =
5720       MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
5721   Offset += ImmOpnd.getImm() * Scale;
5722 
5723   // If the offset doesn't match the scale, we rewrite the instruction to
5724   // use the unscaled instruction instead. Likewise, if we have a negative
5725   // offset and there is an unscaled op to use.
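  // For example (illustrative): LDRXui scales its unsigned immediate by 8, so
  // a leftover byte offset of 12 cannot be encoded in the scaled form; when
  // the unscaled variant LDURXi is available we switch to it and keep the
  // offset in bytes.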
5726   std::optional<unsigned> UnscaledOp =
5727       AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
5728   bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
5729   if (useUnscaledOp &&
5730       !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
5731                                       MaxOff))
5732     llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
5733 
5734   Scale = ScaleValue.getKnownMinValue();
5735   assert(IsMulVL == ScaleValue.isScalable() &&
5736          "Unscaled opcode has different value for scalable");
5737 
5738   int64_t Remainder = Offset % Scale;
5739   assert(!(Remainder && useUnscaledOp) &&
5740          "Cannot have remainder when using unscaled op");
5741 
5742   assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
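  // Split the offset: NewOffset is the portion that fits in the instruction's
  // immediate field (in units of Scale); anything left over remains in Offset
  // and is returned to the caller through SOffset below.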
5743   int64_t NewOffset = Offset / Scale;
5744   if (MinOff <= NewOffset && NewOffset <= MaxOff)
5745     Offset = Remainder;
5746   else {
5747     NewOffset = NewOffset < 0 ? MinOff : MaxOff;
5748     Offset = Offset - (NewOffset * Scale);
5749   }
5750 
5751   if (EmittableOffset)
5752     *EmittableOffset = NewOffset;
5753   if (OutUseUnscaledOp)
5754     *OutUseUnscaledOp = useUnscaledOp;
5755   if (OutUnscaledOp && UnscaledOp)
5756     *OutUnscaledOp = *UnscaledOp;
5757 
5758   if (IsMulVL)
5759     SOffset = StackOffset::get(SOffset.getFixed(), Offset);
5760   else
5761     SOffset = StackOffset::get(Offset, SOffset.getScalable());
5762   return AArch64FrameOffsetCanUpdate |
5763          (SOffset ? 0 : AArch64FrameOffsetIsLegal);
5764 }
5765 
5766 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
5767                                     unsigned FrameReg, StackOffset &Offset,
5768                                     const AArch64InstrInfo *TII) {
5769   unsigned Opcode = MI.getOpcode();
5770   unsigned ImmIdx = FrameRegIdx + 1;
5771 
5772   if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
5773     Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
5774     emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
5775                     MI.getOperand(0).getReg(), FrameReg, Offset, TII,
5776                     MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
5777     MI.eraseFromParent();
5778     Offset = StackOffset();
5779     return true;
5780   }
5781 
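  // Otherwise this is a load/store-style instruction referencing a frame
  // index: ask isAArch64FrameOffsetLegal how much of the offset fits in the
  // immediate field and whether the unscaled opcode variant has to be used.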
5782   int64_t NewOffset;
5783   unsigned UnscaledOp;
5784   bool UseUnscaledOp;
5785   int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
5786                                          &UnscaledOp, &NewOffset);
5787   if (Status & AArch64FrameOffsetCanUpdate) {
5788     if (Status & AArch64FrameOffsetIsLegal)
5789       // Replace the FrameIndex with FrameReg.
5790       MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
5791     if (UseUnscaledOp)
5792       MI.setDesc(TII->get(UnscaledOp));
5793 
5794     MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
5795     return !Offset;
5796   }
5797 
5798   return false;
5799 }
5800 
5801 void AArch64InstrInfo::insertNoop(MachineBasicBlock &MBB,
5802                                   MachineBasicBlock::iterator MI) const {
5803   DebugLoc DL;
5804   BuildMI(MBB, MI, DL, get(AArch64::HINT)).addImm(0);
5805 }
5806 
5807 MCInst AArch64InstrInfo::getNop() const {
5808   return MCInstBuilder(AArch64::HINT).addImm(0);
5809 }
5810 
5811 // AArch64 supports MachineCombiner.
5812 bool AArch64InstrInfo::useMachineCombiner() const { return true; }
5813 
5814 // True when Opc sets the NZCV flags.
5815 static bool isCombineInstrSettingFlag(unsigned Opc) {
5816   switch (Opc) {
5817   case AArch64::ADDSWrr:
5818   case AArch64::ADDSWri:
5819   case AArch64::ADDSXrr:
5820   case AArch64::ADDSXri:
5821   case AArch64::SUBSWrr:
5822   case AArch64::SUBSXrr:
5823   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5824   case AArch64::SUBSWri:
5825   case AArch64::SUBSXri:
5826     return true;
5827   default:
5828     break;
5829   }
5830   return false;
5831 }
5832 
5833 // 32b Opcodes that can be combined with a MUL
5834 static bool isCombineInstrCandidate32(unsigned Opc) {
5835   switch (Opc) {
5836   case AArch64::ADDWrr:
5837   case AArch64::ADDWri:
5838   case AArch64::SUBWrr:
5839   case AArch64::ADDSWrr:
5840   case AArch64::ADDSWri:
5841   case AArch64::SUBSWrr:
5842   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5843   case AArch64::SUBWri:
5844   case AArch64::SUBSWri:
5845     return true;
5846   default:
5847     break;
5848   }
5849   return false;
5850 }
5851 
5852 // 64b Opcodes that can be combined with a MUL
5853 static bool isCombineInstrCandidate64(unsigned Opc) {
5854   switch (Opc) {
5855   case AArch64::ADDXrr:
5856   case AArch64::ADDXri:
5857   case AArch64::SUBXrr:
5858   case AArch64::ADDSXrr:
5859   case AArch64::ADDSXri:
5860   case AArch64::SUBSXrr:
5861   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5862   case AArch64::SUBXri:
5863   case AArch64::SUBSXri:
5864   case AArch64::ADDv8i8:
5865   case AArch64::ADDv16i8:
5866   case AArch64::ADDv4i16:
5867   case AArch64::ADDv8i16:
5868   case AArch64::ADDv2i32:
5869   case AArch64::ADDv4i32:
5870   case AArch64::SUBv8i8:
5871   case AArch64::SUBv16i8:
5872   case AArch64::SUBv4i16:
5873   case AArch64::SUBv8i16:
5874   case AArch64::SUBv2i32:
5875   case AArch64::SUBv4i32:
5876     return true;
5877   default:
5878     break;
5879   }
5880   return false;
5881 }
5882 
5883 // FP Opcodes that can be combined with a FMUL.
5884 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
5885   switch (Inst.getOpcode()) {
5886   default:
5887     break;
5888   case AArch64::FADDHrr:
5889   case AArch64::FADDSrr:
5890   case AArch64::FADDDrr:
5891   case AArch64::FADDv4f16:
5892   case AArch64::FADDv8f16:
5893   case AArch64::FADDv2f32:
5894   case AArch64::FADDv2f64:
5895   case AArch64::FADDv4f32:
5896   case AArch64::FSUBHrr:
5897   case AArch64::FSUBSrr:
5898   case AArch64::FSUBDrr:
5899   case AArch64::FSUBv4f16:
5900   case AArch64::FSUBv8f16:
5901   case AArch64::FSUBv2f32:
5902   case AArch64::FSUBv2f64:
5903   case AArch64::FSUBv4f32:
5904     TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
5905     // We can fuse FADD/FSUB with FMUL if fusion is either allowed globally by
5906     // the target options or the FADD/FSUB has the contract fast-math flag.
5907     return Options.UnsafeFPMath ||
5908            Options.AllowFPOpFusion == FPOpFusion::Fast ||
5909            Inst.getFlag(MachineInstr::FmContract);
5911   }
5912   return false;
5913 }
5914 
5915 // Opcodes that can be combined with a MUL
5916 static bool isCombineInstrCandidate(unsigned Opc) {
5917   return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
5918 }
5919 
5920 //
5921 // Utility routine that checks if \param MO is defined by an
5922 // \param CombineOpc instruction in the basic block \param MBB
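// In addition, the defining instruction's result must have a single non-debug
// use (the instruction we are combining with), and if \param CombineOpc sets
// flags its NZCV def must be dead, since the definition is consumed by the
// combine.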
5923 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
5924                        unsigned CombineOpc, unsigned ZeroReg = 0,
5925                        bool CheckZeroReg = false) {
5926   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
5927   MachineInstr *MI = nullptr;
5928 
5929   if (MO.isReg() && MO.getReg().isVirtual())
5930     MI = MRI.getUniqueVRegDef(MO.getReg());
5931   // And it needs to be in the trace (otherwise, it won't have a depth).
5932   if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
5933     return false;
5934   // Must only be used by the user we combine with.
5935   if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
5936     return false;
5937 
5938   if (CheckZeroReg) {
5939     assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
5940            MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
5941            MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
5942     // The third input reg must be zero.
5943     if (MI->getOperand(3).getReg() != ZeroReg)
5944       return false;
5945   }
5946 
5947   if (isCombineInstrSettingFlag(CombineOpc) &&
5948       MI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) == -1)
5949     return false;
5950 
5951   return true;
5952 }
5953 
5954 //
5955 // Is \param MO defined by an integer multiply and can be combined?
5956 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
5957                               unsigned MulOpc, unsigned ZeroReg) {
5958   return canCombine(MBB, MO, MulOpc, ZeroReg, true);
5959 }
5960 
5961 //
5962 // Is \param MO defined by a floating-point multiply and can be combined?
5963 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
5964                                unsigned MulOpc) {
5965   return canCombine(MBB, MO, MulOpc);
5966 }
5967 
5968 // TODO: There are many more machine instruction opcodes to match:
5969 //       1. Other data types (integer, vectors)
5970 //       2. Other math / logic operations (xor, or)
5971 //       3. Other forms of the same operation (intrinsics and other variants)
5972 bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
5973                                                    bool Invert) const {
5974   if (Invert)
5975     return false;
5976   switch (Inst.getOpcode()) {
5977   // == Floating-point types ==
5978   // -- Floating-point instructions --
5979   case AArch64::FADDHrr:
5980   case AArch64::FADDSrr:
5981   case AArch64::FADDDrr:
5982   case AArch64::FMULHrr:
5983   case AArch64::FMULSrr:
5984   case AArch64::FMULDrr:
5985   case AArch64::FMULX16:
5986   case AArch64::FMULX32:
5987   case AArch64::FMULX64:
5988   // -- Advanced SIMD instructions --
5989   case AArch64::FADDv4f16:
5990   case AArch64::FADDv8f16:
5991   case AArch64::FADDv2f32:
5992   case AArch64::FADDv4f32:
5993   case AArch64::FADDv2f64:
5994   case AArch64::FMULv4f16:
5995   case AArch64::FMULv8f16:
5996   case AArch64::FMULv2f32:
5997   case AArch64::FMULv4f32:
5998   case AArch64::FMULv2f64:
5999   case AArch64::FMULXv4f16:
6000   case AArch64::FMULXv8f16:
6001   case AArch64::FMULXv2f32:
6002   case AArch64::FMULXv4f32:
6003   case AArch64::FMULXv2f64:
6004   // -- SVE instructions --
6005   // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
6006   // in the SVE instruction set (though there are predicated ones).
6007   case AArch64::FADD_ZZZ_H:
6008   case AArch64::FADD_ZZZ_S:
6009   case AArch64::FADD_ZZZ_D:
6010   case AArch64::FMUL_ZZZ_H:
6011   case AArch64::FMUL_ZZZ_S:
6012   case AArch64::FMUL_ZZZ_D:
6013     return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath ||
6014            (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
6015             Inst.getFlag(MachineInstr::MIFlag::FmNsz));
6016 
6017   // == Integer types ==
6018   // -- Base instructions --
6019   // Opcodes MULWrr and MULXrr don't exist because
6020   // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
6021   // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
6022   // The machine combiner does not support three-source-operand machine
6023   // instructions, so we cannot reassociate MULs.
6024   case AArch64::ADDWrr:
6025   case AArch64::ADDXrr:
6026   case AArch64::ANDWrr:
6027   case AArch64::ANDXrr:
6028   case AArch64::ORRWrr:
6029   case AArch64::ORRXrr:
6030   case AArch64::EORWrr:
6031   case AArch64::EORXrr:
6032   case AArch64::EONWrr:
6033   case AArch64::EONXrr:
6034   // -- Advanced SIMD instructions --
6035   // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
6036   // in the Advanced SIMD instruction set.
6037   case AArch64::ADDv8i8:
6038   case AArch64::ADDv16i8:
6039   case AArch64::ADDv4i16:
6040   case AArch64::ADDv8i16:
6041   case AArch64::ADDv2i32:
6042   case AArch64::ADDv4i32:
6043   case AArch64::ADDv1i64:
6044   case AArch64::ADDv2i64:
6045   case AArch64::MULv8i8:
6046   case AArch64::MULv16i8:
6047   case AArch64::MULv4i16:
6048   case AArch64::MULv8i16:
6049   case AArch64::MULv2i32:
6050   case AArch64::MULv4i32:
6051   case AArch64::ANDv8i8:
6052   case AArch64::ANDv16i8:
6053   case AArch64::ORRv8i8:
6054   case AArch64::ORRv16i8:
6055   case AArch64::EORv8i8:
6056   case AArch64::EORv16i8:
6057   // -- SVE instructions --
6058   case AArch64::ADD_ZZZ_B:
6059   case AArch64::ADD_ZZZ_H:
6060   case AArch64::ADD_ZZZ_S:
6061   case AArch64::ADD_ZZZ_D:
6062   case AArch64::MUL_ZZZ_B:
6063   case AArch64::MUL_ZZZ_H:
6064   case AArch64::MUL_ZZZ_S:
6065   case AArch64::MUL_ZZZ_D:
6066   case AArch64::AND_ZZZ:
6067   case AArch64::ORR_ZZZ:
6068   case AArch64::EOR_ZZZ:
6069     return true;
6070 
6071   default:
6072     return false;
6073   }
6074 }
6075 
6076 /// Find instructions that can be turned into madd.
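/// Illustrative example (virtual-register MIR, not from the original source):
///   %3:gpr64 = MADDXrrr %1, %2, $xzr    ; i.e. %3 = %1 * %2
///   %4:gpr64 = ADDXrr %0, %3
/// is a MULADDX_OP2 candidate and can later be rewritten as
///   %4:gpr64 = MADDXrrr %1, %2, %0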
6077 static bool getMaddPatterns(MachineInstr &Root,
6078                             SmallVectorImpl<unsigned> &Patterns) {
6079   unsigned Opc = Root.getOpcode();
6080   MachineBasicBlock &MBB = *Root.getParent();
6081   bool Found = false;
6082 
6083   if (!isCombineInstrCandidate(Opc))
6084     return false;
6085   if (isCombineInstrSettingFlag(Opc)) {
6086     int Cmp_NZCV =
6087         Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
6088     // When NZCV is live, bail out.
6089     if (Cmp_NZCV == -1)
6090       return false;
6091     unsigned NewOpc = convertToNonFlagSettingOpc(Root);
6092     // When the opcode can't change, bail out.
6093     // CHECKME: do we miss any cases for opcode conversion?
6094     if (NewOpc == Opc)
6095       return false;
6096     Opc = NewOpc;
6097   }
6098 
6099   auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
6100                       unsigned Pattern) {
6101     if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
6102       Patterns.push_back(Pattern);
6103       Found = true;
6104     }
6105   };
6106 
6107   auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
6108     if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
6109       Patterns.push_back(Pattern);
6110       Found = true;
6111     }
6112   };
6113 
6114   typedef AArch64MachineCombinerPattern MCP;
6115 
6116   switch (Opc) {
6117   default:
6118     break;
6119   case AArch64::ADDWrr:
6120     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6121            "ADDWrr does not have register operands");
6122     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
6123     setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
6124     break;
6125   case AArch64::ADDXrr:
6126     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
6127     setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
6128     break;
6129   case AArch64::SUBWrr:
6130     setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
6131     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
6132     break;
6133   case AArch64::SUBXrr:
6134     setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
6135     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
6136     break;
6137   case AArch64::ADDWri:
6138     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
6139     break;
6140   case AArch64::ADDXri:
6141     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
6142     break;
6143   case AArch64::SUBWri:
6144     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
6145     break;
6146   case AArch64::SUBXri:
6147     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
6148     break;
6149   case AArch64::ADDv8i8:
6150     setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
6151     setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
6152     break;
6153   case AArch64::ADDv16i8:
6154     setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
6155     setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
6156     break;
6157   case AArch64::ADDv4i16:
6158     setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
6159     setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
6160     setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
6161     setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
6162     break;
6163   case AArch64::ADDv8i16:
6164     setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
6165     setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
6166     setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
6167     setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
6168     break;
6169   case AArch64::ADDv2i32:
6170     setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
6171     setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
6172     setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
6173     setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
6174     break;
6175   case AArch64::ADDv4i32:
6176     setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
6177     setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
6178     setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
6179     setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
6180     break;
6181   case AArch64::SUBv8i8:
6182     setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
6183     setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
6184     break;
6185   case AArch64::SUBv16i8:
6186     setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
6187     setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
6188     break;
6189   case AArch64::SUBv4i16:
6190     setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
6191     setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
6192     setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
6193     setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
6194     break;
6195   case AArch64::SUBv8i16:
6196     setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
6197     setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
6198     setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
6199     setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
6200     break;
6201   case AArch64::SUBv2i32:
6202     setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
6203     setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
6204     setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
6205     setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
6206     break;
6207   case AArch64::SUBv4i32:
6208     setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
6209     setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
6210     setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
6211     setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
6212     break;
6213   }
6214   return Found;
6215 }
6216 /// Floating-Point Support
6217 
6218 /// Find floating-point instructions that can be turned into a fused multiply-add (fmadd).
6219 static bool getFMAPatterns(MachineInstr &Root,
6220                            SmallVectorImpl<unsigned> &Patterns) {
6221 
6222   if (!isCombineInstrCandidateFP(Root))
6223     return false;
6224 
6225   MachineBasicBlock &MBB = *Root.getParent();
6226   bool Found = false;
6227 
6228   auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
6229     if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
6230       Patterns.push_back(Pattern);
6231       return true;
6232     }
6233     return false;
6234   };
6235 
6236   typedef AArch64MachineCombinerPattern MCP;
6237 
6238   switch (Root.getOpcode()) {
6239   default:
6240     assert(false && "Unsupported FP instruction in combiner\n");
6241     break;
6242   case AArch64::FADDHrr:
6243     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6244            "FADDHrr does not have register operands");
6245 
6246     Found  = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
6247     Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
6248     break;
6249   case AArch64::FADDSrr:
6250     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6251            "FADDSrr does not have register operands");
6252 
6253     Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
6254              Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
6255 
6256     Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
6257              Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
6258     break;
6259   case AArch64::FADDDrr:
6260     Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
6261              Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
6262 
6263     Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
6264              Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
6265     break;
6266   case AArch64::FADDv4f16:
6267     Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
6268              Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
6269 
6270     Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
6271              Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
6272     break;
6273   case AArch64::FADDv8f16:
6274     Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
6275              Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
6276 
6277     Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
6278              Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
6279     break;
6280   case AArch64::FADDv2f32:
6281     Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
6282              Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
6283 
6284     Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
6285              Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
6286     break;
6287   case AArch64::FADDv2f64:
6288     Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
6289              Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
6290 
6291     Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
6292              Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
6293     break;
6294   case AArch64::FADDv4f32:
6295     Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
6296              Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
6297 
6298     Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
6299              Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
6300     break;
6301   case AArch64::FSUBHrr:
6302     Found  = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
6303     Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
6304     Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
6305     break;
6306   case AArch64::FSUBSrr:
6307     Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
6308 
6309     Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
6310              Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
6311 
6312     Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
6313     break;
6314   case AArch64::FSUBDrr:
6315     Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
6316 
6317     Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
6318              Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
6319 
6320     Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
6321     break;
6322   case AArch64::FSUBv4f16:
6323     Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
6324              Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
6325 
6326     Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
6327              Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
6328     break;
6329   case AArch64::FSUBv8f16:
6330     Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
6331              Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
6332 
6333     Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
6334              Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
6335     break;
6336   case AArch64::FSUBv2f32:
6337     Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
6338              Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
6339 
6340     Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
6341              Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
6342     break;
6343   case AArch64::FSUBv2f64:
6344     Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
6345              Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
6346 
6347     Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
6348              Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
6349     break;
6350   case AArch64::FSUBv4f32:
6351     Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
6352              Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
6353 
6354     Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
6355              Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
6356     break;
6357   }
6358   return Found;
6359 }
6360 
6361 static bool getFMULPatterns(MachineInstr &Root,
6362                             SmallVectorImpl<unsigned> &Patterns) {
6363   MachineBasicBlock &MBB = *Root.getParent();
6364   bool Found = false;
6365 
6366   auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
6367     MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6368     MachineOperand &MO = Root.getOperand(Operand);
6369     MachineInstr *MI = nullptr;
6370     if (MO.isReg() && MO.getReg().isVirtual())
6371       MI = MRI.getUniqueVRegDef(MO.getReg());
6372     // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
6373     if (MI && MI->getOpcode() == TargetOpcode::COPY &&
6374         MI->getOperand(1).getReg().isVirtual())
6375       MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());
6376     if (MI && MI->getOpcode() == Opcode) {
6377       Patterns.push_back(Pattern);
6378       return true;
6379     }
6380     return false;
6381   };
6382 
6383   typedef AArch64MachineCombinerPattern MCP;
6384 
6385   switch (Root.getOpcode()) {
6386   default:
6387     return false;
6388   case AArch64::FMULv2f32:
6389     Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
6390     Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
6391     break;
6392   case AArch64::FMULv2f64:
6393     Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
6394     Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
6395     break;
6396   case AArch64::FMULv4f16:
6397     Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
6398     Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
6399     break;
6400   case AArch64::FMULv4f32:
6401     Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
6402     Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
6403     break;
6404   case AArch64::FMULv8f16:
6405     Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
6406     Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
6407     break;
6408   }
6409 
6410   return Found;
6411 }
6412 
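/// Match an FNEG whose operand is a single-use FMADD so that the pair can
/// later be rewritten as one FNMADD; the pattern is only accepted when both
/// instructions carry the contract and nsz fast-math flags.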
6413 static bool getFNEGPatterns(MachineInstr &Root,
6414                             SmallVectorImpl<unsigned> &Patterns) {
6415   unsigned Opc = Root.getOpcode();
6416   MachineBasicBlock &MBB = *Root.getParent();
6417   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6418 
6419   auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
6420     MachineOperand &MO = Root.getOperand(1);
6421     MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg());
6422     if (MI != nullptr && (MI->getOpcode() == Opcode) &&
6423         MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&
6424         Root.getFlag(MachineInstr::MIFlag::FmContract) &&
6425         Root.getFlag(MachineInstr::MIFlag::FmNsz) &&
6426         MI->getFlag(MachineInstr::MIFlag::FmContract) &&
6427         MI->getFlag(MachineInstr::MIFlag::FmNsz)) {
6428       Patterns.push_back(Pattern);
6429       return true;
6430     }
6431     return false;
6432   };
6433 
6434   switch (Opc) {
6435   default:
6436     break;
6437   case AArch64::FNEGDr:
6438     return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
6439   case AArch64::FNEGSr:
6440     return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
6441   }
6442 
6443   return false;
6444 }
6445 
6446 /// Return true when a code sequence can improve throughput. It
6447 /// should be called only for instructions in loops.
6448 /// \param Pattern - combiner pattern
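/// Returning true is a heuristic: the fused and indexed forms listed below
/// are assumed to occupy the multiplier no longer than the separate multiply
/// they replace, so preferring them in loops tends to help throughput even
/// when the latency of the dependence chain is unchanged.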
6449 bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
6450   switch (Pattern) {
6451   default:
6452     break;
6453   case AArch64MachineCombinerPattern::FMULADDH_OP1:
6454   case AArch64MachineCombinerPattern::FMULADDH_OP2:
6455   case AArch64MachineCombinerPattern::FMULSUBH_OP1:
6456   case AArch64MachineCombinerPattern::FMULSUBH_OP2:
6457   case AArch64MachineCombinerPattern::FMULADDS_OP1:
6458   case AArch64MachineCombinerPattern::FMULADDS_OP2:
6459   case AArch64MachineCombinerPattern::FMULSUBS_OP1:
6460   case AArch64MachineCombinerPattern::FMULSUBS_OP2:
6461   case AArch64MachineCombinerPattern::FMULADDD_OP1:
6462   case AArch64MachineCombinerPattern::FMULADDD_OP2:
6463   case AArch64MachineCombinerPattern::FMULSUBD_OP1:
6464   case AArch64MachineCombinerPattern::FMULSUBD_OP2:
6465   case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
6466   case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
6467   case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
6468   case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
6469   case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
6470   case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
6471   case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
6472   case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
6473   case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
6474   case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
6475   case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
6476   case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
6477   case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
6478   case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
6479   case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
6480   case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
6481   case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
6482   case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
6483   case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
6484   case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
6485   case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
6486   case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
6487   case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
6488   case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
6489   case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
6490   case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
6491   case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
6492   case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1:
6493   case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
6494   case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1:
6495   case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
6496   case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
6497   case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
6498   case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
6499   case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
6500   case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
6501   case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
6502   case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
6503   case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
6504   case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
6505   case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
6506   case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
6507   case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
6508   case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
6509   case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2:
6510   case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
6511   case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2:
6512   case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
6513   case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2:
6514   case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
6515   case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2:
6516   case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
6517   case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2:
6518   case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
6519   case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
6520   case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
6521   case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
6522   case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
6523   case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
6524   case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
6525   case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
6526   case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
6527   case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
6528   case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
6529   case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
6530   case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
6531   case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
6532   case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
6533   case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
6534   case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
6535   case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
6536   case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
6537   case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
6538   case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
6539   case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
6540   case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
6541   case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
6542   case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
6543   case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
6544   case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
6545   case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
6546   case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
6547   case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
6548   case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
6549   case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
6550   case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
6551   case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
6552   case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
6553   case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
6554   case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
6555   case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
6556   case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
6557   case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
6558     return true;
6559   } // end switch (Pattern)
6560   return false;
6561 }
6562 
6563 /// Find other MI combine patterns.
6564 static bool getMiscPatterns(MachineInstr &Root,
6565                             SmallVectorImpl<unsigned> &Patterns) {
6566   // A - (B + C)  ==>   (A - B) - C  or  (A - C) - B
6567   unsigned Opc = Root.getOpcode();
6568   MachineBasicBlock &MBB = *Root.getParent();
6569 
6570   switch (Opc) {
6571   case AArch64::SUBWrr:
6572   case AArch64::SUBSWrr:
6573   case AArch64::SUBXrr:
6574   case AArch64::SUBSXrr:
6575     // Found candidate root.
6576     break;
6577   default:
6578     return false;
6579   }
6580 
6581   if (isCombineInstrSettingFlag(Opc) &&
6582       Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) ==
6583           -1)
6584     return false;
6585 
6586   if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
6587       canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
6588       canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
6589       canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
6590     Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP1);
6591     Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP2);
6592     return true;
6593   }
6594 
6595   return false;
6596 }
6597 
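/// The SUBADD patterns trade A - (B + C) for two back-to-back subtractions of
/// the same total length, so they only pay off when they shorten the critical
/// path; report MustReduceDepth so the combiner evaluates depth rather than
/// resource length for them.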
6598 CombinerObjective
6599 AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
6600   switch (Pattern) {
6601   case AArch64MachineCombinerPattern::SUBADD_OP1:
6602   case AArch64MachineCombinerPattern::SUBADD_OP2:
6603     return CombinerObjective::MustReduceDepth;
6604   default:
6605     return TargetInstrInfo::getCombinerObjective(Pattern);
6606   }
6607 }
6608 
6609 /// Return true when there is potentially a faster code sequence for an
6610 /// instruction chain ending in \p Root. All potential patterns are listed in
6611 /// the \p Patterns vector. Patterns should be sorted in priority order since the
6612 /// pattern evaluator stops checking as soon as it finds a faster sequence.
6613 
6614 bool AArch64InstrInfo::getMachineCombinerPatterns(
6615     MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
6616     bool DoRegPressureReduce) const {
6617   // Integer patterns
6618   if (getMaddPatterns(Root, Patterns))
6619     return true;
6620   // Floating point patterns
6621   if (getFMULPatterns(Root, Patterns))
6622     return true;
6623   if (getFMAPatterns(Root, Patterns))
6624     return true;
6625   if (getFNEGPatterns(Root, Patterns))
6626     return true;
6627 
6628   // Other patterns
6629   if (getMiscPatterns(Root, Patterns))
6630     return true;
6631 
6632   return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
6633                                                      DoRegPressureReduce);
6634 }
6635 
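// Operand order used by genFusedMultiply when emitting the fused instruction:
//  - Default:     (mul-op1, mul-op2, addend), as in MADD/FMADD.
//  - Indexed:     (addend, mul-op1, mul-op2, lane immediate), as in FMLA (by element).
//  - Accumulator: (addend, mul-op1, mul-op2), as in FMLA (vector) / MLA.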
6636 enum class FMAInstKind { Default, Indexed, Accumulator };
6637 /// genFusedMultiply - Generate fused multiply instructions.
6638 /// This function supports both integer and floating point instructions.
6639 /// A typical example:
6640 ///  F|MUL I=A,B,0
6641 ///  F|ADD R,I,C
6642 ///  ==> F|MADD R,A,B,C
6643 /// \param MF Containing MachineFunction
6644 /// \param MRI Register information
6645 /// \param TII Target information
6646 /// \param Root is the F|ADD instruction
6647 /// \param [out] InsInstrs is a vector of machine instructions and will
6648 /// contain the generated madd instruction
6649 /// \param IdxMulOpd is index of operand in Root that is the result of
6650 /// the F|MUL. In the example above IdxMulOpd is 1.
6651 /// \param MaddOpc the opcode of the f|madd instruction
6652 /// \param RC Register class of operands
6653 /// \param kind Kind of fma instruction (addressing mode) to be generated
6654 /// \param ReplacedAddend is the result register from the instruction
6655 /// replacing the non-combined operand, if any.
6656 static MachineInstr *
6657 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
6658                  const TargetInstrInfo *TII, MachineInstr &Root,
6659                  SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
6660                  unsigned MaddOpc, const TargetRegisterClass *RC,
6661                  FMAInstKind kind = FMAInstKind::Default,
6662                  const Register *ReplacedAddend = nullptr) {
6663   assert(IdxMulOpd == 1 || IdxMulOpd == 2);
6664 
6665   unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
6666   MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
6667   Register ResultReg = Root.getOperand(0).getReg();
6668   Register SrcReg0 = MUL->getOperand(1).getReg();
6669   bool Src0IsKill = MUL->getOperand(1).isKill();
6670   Register SrcReg1 = MUL->getOperand(2).getReg();
6671   bool Src1IsKill = MUL->getOperand(2).isKill();
6672 
6673   Register SrcReg2;
6674   bool Src2IsKill;
6675   if (ReplacedAddend) {
6676     // If we just generated a new addend, we must be its only use.
6677     SrcReg2 = *ReplacedAddend;
6678     Src2IsKill = true;
6679   } else {
6680     SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
6681     Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
6682   }
6683 
6684   if (ResultReg.isVirtual())
6685     MRI.constrainRegClass(ResultReg, RC);
6686   if (SrcReg0.isVirtual())
6687     MRI.constrainRegClass(SrcReg0, RC);
6688   if (SrcReg1.isVirtual())
6689     MRI.constrainRegClass(SrcReg1, RC);
6690   if (SrcReg2.isVirtual())
6691     MRI.constrainRegClass(SrcReg2, RC);
6692 
6693   MachineInstrBuilder MIB;
6694   if (kind == FMAInstKind::Default)
6695     MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
6696               .addReg(SrcReg0, getKillRegState(Src0IsKill))
6697               .addReg(SrcReg1, getKillRegState(Src1IsKill))
6698               .addReg(SrcReg2, getKillRegState(Src2IsKill));
6699   else if (kind == FMAInstKind::Indexed)
6700     MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
6701               .addReg(SrcReg2, getKillRegState(Src2IsKill))
6702               .addReg(SrcReg0, getKillRegState(Src0IsKill))
6703               .addReg(SrcReg1, getKillRegState(Src1IsKill))
6704               .addImm(MUL->getOperand(3).getImm());
6705   else if (kind == FMAInstKind::Accumulator)
6706     MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
6707               .addReg(SrcReg2, getKillRegState(Src2IsKill))
6708               .addReg(SrcReg0, getKillRegState(Src0IsKill))
6709               .addReg(SrcReg1, getKillRegState(Src1IsKill));
6710   else
6711     assert(false && "Invalid FMA instruction kind \n");
6712   // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
6713   InsInstrs.push_back(MIB);
6714   return MUL;
6715 }
6716 
6717 static MachineInstr *
6718 genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI,
6719                const TargetInstrInfo *TII, MachineInstr &Root,
6720                SmallVectorImpl<MachineInstr *> &InsInstrs) {
6721   MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
6722 
6723   unsigned Opc = 0;
6724   const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());
6725   if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6726     Opc = AArch64::FNMADDSrrr;
6727   else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
6728     Opc = AArch64::FNMADDDrrr;
6729   else
6730     return nullptr;
6731 
6732   Register ResultReg = Root.getOperand(0).getReg();
6733   Register SrcReg0 = MAD->getOperand(1).getReg();
6734   Register SrcReg1 = MAD->getOperand(2).getReg();
6735   Register SrcReg2 = MAD->getOperand(3).getReg();
6736   bool Src0IsKill = MAD->getOperand(1).isKill();
6737   bool Src1IsKill = MAD->getOperand(2).isKill();
6738   bool Src2IsKill = MAD->getOperand(3).isKill();
6739   if (ResultReg.isVirtual())
6740     MRI.constrainRegClass(ResultReg, RC);
6741   if (SrcReg0.isVirtual())
6742     MRI.constrainRegClass(SrcReg0, RC);
6743   if (SrcReg1.isVirtual())
6744     MRI.constrainRegClass(SrcReg1, RC);
6745   if (SrcReg2.isVirtual())
6746     MRI.constrainRegClass(SrcReg2, RC);
6747 
6748   MachineInstrBuilder MIB =
6749       BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)
6750           .addReg(SrcReg0, getKillRegState(Src0IsKill))
6751           .addReg(SrcReg1, getKillRegState(Src1IsKill))
6752           .addReg(SrcReg2, getKillRegState(Src2IsKill));
6753   InsInstrs.push_back(MIB);
6754 
6755   return MAD;
6756 }
6757 
6758 /// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
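/// Illustrative example (not from the original source):
///   %d:fpr128 = DUPv4i32lane %v, 1
///   %r:fpr128 = FMULv4f32 %x, %d
/// becomes
///   %r:fpr128 = FMULv4i32_indexed %x, %v, 1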
6759 static MachineInstr *
6760 genIndexedMultiply(MachineInstr &Root,
6761                    SmallVectorImpl<MachineInstr *> &InsInstrs,
6762                    unsigned IdxDupOp, unsigned MulOpc,
6763                    const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
6764   assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
6765          "Invalid index of FMUL operand");
6766 
6767   MachineFunction &MF = *Root.getMF();
6768   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
6769 
6770   MachineInstr *Dup =
6771       MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());
6772 
6773   if (Dup->getOpcode() == TargetOpcode::COPY)
6774     Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg());
6775 
6776   Register DupSrcReg = Dup->getOperand(1).getReg();
6777   MRI.clearKillFlags(DupSrcReg);
6778   MRI.constrainRegClass(DupSrcReg, RC);
6779 
6780   unsigned DupSrcLane = Dup->getOperand(2).getImm();
6781 
6782   unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
6783   MachineOperand &MulOp = Root.getOperand(IdxMulOp);
6784 
6785   Register ResultReg = Root.getOperand(0).getReg();
6786 
6787   MachineInstrBuilder MIB;
6788   MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg)
6789             .add(MulOp)
6790             .addReg(DupSrcReg)
6791             .addImm(DupSrcLane);
6792 
6793   InsInstrs.push_back(MIB);
6794   return &Root;
6795 }
6796 
6797 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
6798 /// instructions.
6799 ///
6800 /// \see genFusedMultiply
6801 static MachineInstr *genFusedMultiplyAcc(
6802     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6803     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6804     unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
6805   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6806                           FMAInstKind::Accumulator);
6807 }
6808 
6809 /// genNeg - Helper to generate an intermediate negation of the second operand
6810 /// of Root
6811 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
6812                        const TargetInstrInfo *TII, MachineInstr &Root,
6813                        SmallVectorImpl<MachineInstr *> &InsInstrs,
6814                        DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
6815                        unsigned MnegOpc, const TargetRegisterClass *RC) {
6816   Register NewVR = MRI.createVirtualRegister(RC);
6817   MachineInstrBuilder MIB =
6818       BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR)
6819           .add(Root.getOperand(2));
6820   InsInstrs.push_back(MIB);
6821 
6822   assert(InstrIdxForVirtReg.empty());
6823   InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
6824 
6825   return NewVR;
6826 }
6827 
6828 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
6829 /// instructions with an additional negation of the accumulator
6830 static MachineInstr *genFusedMultiplyAccNeg(
6831     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6832     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6833     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
6834     unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
6835   assert(IdxMulOpd == 1);
6836 
6837   Register NewVR =
6838       genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
6839   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6840                           FMAInstKind::Accumulator, &NewVR);
6841 }
6842 
6843 /// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
6844 /// instructions.
6845 ///
6846 /// \see genFusedMultiply
6847 static MachineInstr *genFusedMultiplyIdx(
6848     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6849     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6850     unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
6851   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6852                           FMAInstKind::Indexed);
6853 }
6854 
6855 /// genFusedMultiplyIdxNeg - Helper to generate indexed fused multiply accumulate
6856 /// instructions with an additional negation of the accumulator
6857 static MachineInstr *genFusedMultiplyIdxNeg(
6858     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6859     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6860     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
6861     unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
6862   assert(IdxMulOpd == 1);
6863 
6864   Register NewVR =
6865       genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
6866 
6867   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6868                           FMAInstKind::Indexed, &NewVR);
6869 }
6870 
6871 /// genMaddR - Generate madd instruction and combine mul and add using
6872 /// an extra virtual register
6873 /// Example - an ADD immediate operand needs to be materialized in a register:
6874 ///   MUL I=A,B,0
6875 ///   ADD R,I,Imm
6876 ///   ==> ORR  V, ZR, Imm
6877 ///   ==> MADD R,A,B,V
6878 /// \param MF Containing MachineFunction
6879 /// \param MRI Register information
6880 /// \param TII Target information
6881 /// \param Root is the ADD instruction
6882 /// \param [out] InsInstrs is a vector of machine instructions and will
6883 /// contain the generated madd instruction
6884 /// \param IdxMulOpd is index of operand in Root that is the result of
6885 /// the MUL. In the example above IdxMulOpd is 1.
6886 /// \param MaddOpc the opcode of the madd instruction
6887 /// \param VR is a virtual register that holds the value of an ADD operand
6888 /// (V in the example above).
6889 /// \param RC Register class of operands
6890 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
6891                               const TargetInstrInfo *TII, MachineInstr &Root,
6892                               SmallVectorImpl<MachineInstr *> &InsInstrs,
6893                               unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
6894                               const TargetRegisterClass *RC) {
6895   assert(IdxMulOpd == 1 || IdxMulOpd == 2);
6896 
6897   MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
6898   Register ResultReg = Root.getOperand(0).getReg();
6899   Register SrcReg0 = MUL->getOperand(1).getReg();
6900   bool Src0IsKill = MUL->getOperand(1).isKill();
6901   Register SrcReg1 = MUL->getOperand(2).getReg();
6902   bool Src1IsKill = MUL->getOperand(2).isKill();
6903 
6904   if (ResultReg.isVirtual())
6905     MRI.constrainRegClass(ResultReg, RC);
6906   if (SrcReg0.isVirtual())
6907     MRI.constrainRegClass(SrcReg0, RC);
6908   if (SrcReg1.isVirtual())
6909     MRI.constrainRegClass(SrcReg1, RC);
6910   if (Register::isVirtualRegister(VR))
6911     MRI.constrainRegClass(VR, RC);
6912 
6913   MachineInstrBuilder MIB =
6914       BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
6915           .addReg(SrcReg0, getKillRegState(Src0IsKill))
6916           .addReg(SrcReg1, getKillRegState(Src1IsKill))
6917           .addReg(VR);
6918   // Insert the MADD
6919   InsInstrs.push_back(MIB);
6920   return MUL;
6921 }
6922 
6923 /// Do the following transformation
6924 /// A - (B + C)  ==>   (A - B) - C
6925 /// A - (B + C)  ==>   (A - C) - B
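///
/// For example, with IdxOpd1 == 1 (illustrative only):
///   ADD I=B,C
///   SUB R=A,I
///   ==> SUB V=A,B
///   ==> SUB R=V,C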
6926 static void
6927 genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
6928                  const TargetInstrInfo *TII, MachineInstr &Root,
6929                  SmallVectorImpl<MachineInstr *> &InsInstrs,
6930                  SmallVectorImpl<MachineInstr *> &DelInstrs,
6931                  unsigned IdxOpd1,
6932                  DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
6933   assert(IdxOpd1 == 1 || IdxOpd1 == 2);
6934   unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
6935   MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
6936 
6937   Register ResultReg = Root.getOperand(0).getReg();
6938   Register RegA = Root.getOperand(1).getReg();
6939   bool RegAIsKill = Root.getOperand(1).isKill();
6940   Register RegB = AddMI->getOperand(IdxOpd1).getReg();
6941   bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
6942   Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
6943   bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
6944   Register NewVR = MRI.createVirtualRegister(MRI.getRegClass(RegA));
6945 
6946   unsigned Opcode = Root.getOpcode();
6947   if (Opcode == AArch64::SUBSWrr)
6948     Opcode = AArch64::SUBWrr;
6949   else if (Opcode == AArch64::SUBSXrr)
6950     Opcode = AArch64::SUBXrr;
6951   else
6952     assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
6953            "Unexpected instruction opcode.");
6954 
6955   uint32_t Flags = Root.mergeFlagsWith(*AddMI);
6956   Flags &= ~MachineInstr::NoSWrap;
6957   Flags &= ~MachineInstr::NoUWrap;
6958 
6959   MachineInstrBuilder MIB1 =
6960       BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)
6961           .addReg(RegA, getKillRegState(RegAIsKill))
6962           .addReg(RegB, getKillRegState(RegBIsKill))
6963           .setMIFlags(Flags);
6964   MachineInstrBuilder MIB2 =
6965       BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)
6966           .addReg(NewVR, getKillRegState(true))
6967           .addReg(RegC, getKillRegState(RegCIsKill))
6968           .setMIFlags(Flags);
6969 
6970   InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
6971   InsInstrs.push_back(MIB1);
6972   InsInstrs.push_back(MIB2);
6973   DelInstrs.push_back(AddMI);
6974   DelInstrs.push_back(&Root);
6975 }
6976 
6977 /// When getMachineCombinerPatterns() finds potential patterns,
6978 /// this function generates the instructions that could replace the
6979 /// original code sequence
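/// New instructions are appended to \p InsInstrs and the instructions they
/// replace to \p DelInstrs; any newly created virtual register is recorded in
/// \p InstrIdxForVirtReg together with the index of its defining instruction
/// in \p InsInstrs. The MachineCombiner then decides whether applying the
/// replacement is profitable.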
6980 void AArch64InstrInfo::genAlternativeCodeSequence(
6981     MachineInstr &Root, unsigned Pattern,
6982     SmallVectorImpl<MachineInstr *> &InsInstrs,
6983     SmallVectorImpl<MachineInstr *> &DelInstrs,
6984     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
6985   MachineBasicBlock &MBB = *Root.getParent();
6986   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6987   MachineFunction &MF = *MBB.getParent();
6988   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
6989 
6990   MachineInstr *MUL = nullptr;
6991   const TargetRegisterClass *RC;
6992   unsigned Opc;
6993   switch (Pattern) {
6994   default:
6995     // Reassociate instructions.
6996     TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
6997                                                 DelInstrs, InstrIdxForVirtReg);
6998     return;
6999   case AArch64MachineCombinerPattern::SUBADD_OP1:
7000     // A - (B + C)
7001     // ==> (A - B) - C
7002     genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
7003                      InstrIdxForVirtReg);
7004     return;
7005   case AArch64MachineCombinerPattern::SUBADD_OP2:
7006     // A - (B + C)
7007     // ==> (A - C) - B
7008     genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
7009                      InstrIdxForVirtReg);
7010     return;
7011   case AArch64MachineCombinerPattern::MULADDW_OP1:
7012   case AArch64MachineCombinerPattern::MULADDX_OP1:
7013     // MUL I=A,B,0
7014     // ADD R,I,C
7015     // ==> MADD R,A,B,C
7016     // --- Create(MADD);
7017     if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP1) {
7018       Opc = AArch64::MADDWrrr;
7019       RC = &AArch64::GPR32RegClass;
7020     } else {
7021       Opc = AArch64::MADDXrrr;
7022       RC = &AArch64::GPR64RegClass;
7023     }
7024     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7025     break;
7026   case AArch64MachineCombinerPattern::MULADDW_OP2:
7027   case AArch64MachineCombinerPattern::MULADDX_OP2:
7028     // MUL I=A,B,0
7029     // ADD R,C,I
7030     // ==> MADD R,A,B,C
7031     // --- Create(MADD);
7032     if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP2) {
7033       Opc = AArch64::MADDWrrr;
7034       RC = &AArch64::GPR32RegClass;
7035     } else {
7036       Opc = AArch64::MADDXrrr;
7037       RC = &AArch64::GPR64RegClass;
7038     }
7039     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7040     break;
7041   case AArch64MachineCombinerPattern::MULADDWI_OP1:
7042   case AArch64MachineCombinerPattern::MULADDXI_OP1: {
7043     // MUL I=A,B,0
7044     // ADD R,I,Imm
7045     // ==> MOV V, Imm
7046     // ==> MADD R,A,B,V
7047     // --- Create(MADD);
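    // Illustrative example (hypothetical registers and values): with Imm == 1
    // and a left-shift operand of 12, the materialized constant is 4096:
    //   mul  w8, w0, w1
    //   add  w0, w8, #1, lsl #12
    //   ==> mov  w9, #4096
    //   ==> madd w0, w0, w1, w9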
7048     const TargetRegisterClass *OrrRC;
7049     unsigned BitSize, OrrOpc, ZeroReg;
7050     if (Pattern == AArch64MachineCombinerPattern::MULADDWI_OP1) {
7051       OrrOpc = AArch64::ORRWri;
7052       OrrRC = &AArch64::GPR32spRegClass;
7053       BitSize = 32;
7054       ZeroReg = AArch64::WZR;
7055       Opc = AArch64::MADDWrrr;
7056       RC = &AArch64::GPR32RegClass;
7057     } else {
7058       OrrOpc = AArch64::ORRXri;
7059       OrrRC = &AArch64::GPR64spRegClass;
7060       BitSize = 64;
7061       ZeroReg = AArch64::XZR;
7062       Opc = AArch64::MADDXrrr;
7063       RC = &AArch64::GPR64RegClass;
7064     }
7065     Register NewVR = MRI.createVirtualRegister(OrrRC);
7066     uint64_t Imm = Root.getOperand(2).getImm();
7067 
7068     if (Root.getOperand(3).isImm()) {
7069       unsigned Val = Root.getOperand(3).getImm();
7070       Imm = Imm << Val;
7071     }
7072     uint64_t UImm = SignExtend64(Imm, BitSize);
7073     // Check that the immediate can be composed via a single instruction.
7074     SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
7075     AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
7076     if (Insn.size() != 1)
7077       return;
7078     auto MovI = Insn.begin();
7079     MachineInstrBuilder MIB1;
7080     // MOV is an alias for one of three instructions: movz, movn, and orr.
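    // For instance, small positive values map to a single MOVZ, values whose
    // bitwise complement is small map to a single MOVN, and logical-immediate
    // values map to a single ORR against the zero register (illustrative; the
    // exact choice is made by expandMOVImm).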
7081     if (MovI->Opcode == OrrOpc)
7082       MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR)
7083                  .addReg(ZeroReg)
7084                  .addImm(MovI->Op2);
7085     else {
7086       if (BitSize == 32)
7087         assert((MovI->Opcode == AArch64::MOVNWi ||
7088                 MovI->Opcode == AArch64::MOVZWi) &&
7089                "Expected opcode");
7090       else
7091         assert((MovI->Opcode == AArch64::MOVNXi ||
7092                 MovI->Opcode == AArch64::MOVZXi) &&
7093                "Expected opcode");
7094       MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR)
7095                  .addImm(MovI->Op1)
7096                  .addImm(MovI->Op2);
7097     }
7098     InsInstrs.push_back(MIB1);
7099     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7100     MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7101     break;
7102   }
7103   case AArch64MachineCombinerPattern::MULSUBW_OP1:
7104   case AArch64MachineCombinerPattern::MULSUBX_OP1: {
7105     // MUL I=A,B,0
7106     // SUB R,I, C
7107     // ==> SUB  V, 0, C
7108     // ==> MADD R,A,B,V // = -C + A*B
7109     // --- Create(MADD);
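    // Illustrative example (hypothetical registers):
    //   mul  w8, w0, w1
    //   sub  w0, w8, w2
    //   ==> sub  w9, wzr, w2     // = -C
    //   ==> madd w0, w0, w1, w9  // = A*B - C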
7110     const TargetRegisterClass *SubRC;
7111     unsigned SubOpc, ZeroReg;
7112     if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP1) {
7113       SubOpc = AArch64::SUBWrr;
7114       SubRC = &AArch64::GPR32spRegClass;
7115       ZeroReg = AArch64::WZR;
7116       Opc = AArch64::MADDWrrr;
7117       RC = &AArch64::GPR32RegClass;
7118     } else {
7119       SubOpc = AArch64::SUBXrr;
7120       SubRC = &AArch64::GPR64spRegClass;
7121       ZeroReg = AArch64::XZR;
7122       Opc = AArch64::MADDXrrr;
7123       RC = &AArch64::GPR64RegClass;
7124     }
7125     Register NewVR = MRI.createVirtualRegister(SubRC);
7126     // SUB NewVR, 0, C
7127     MachineInstrBuilder MIB1 =
7128         BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR)
7129             .addReg(ZeroReg)
7130             .add(Root.getOperand(2));
7131     InsInstrs.push_back(MIB1);
7132     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7133     MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7134     break;
7135   }
7136   case AArch64MachineCombinerPattern::MULSUBW_OP2:
7137   case AArch64MachineCombinerPattern::MULSUBX_OP2:
7138     // MUL I=A,B,0
7139     // SUB R,C,I
7140     // ==> MSUB R,A,B,C (computes C - A*B)
7141     // --- Create(MSUB);
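    // Illustrative example (hypothetical registers):
    //   mul  w8, w0, w1
    //   sub  w0, w2, w8
    //   ==> msub w0, w0, w1, w2  // = w2 - w0*w1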
7142     if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP2) {
7143       Opc = AArch64::MSUBWrrr;
7144       RC = &AArch64::GPR32RegClass;
7145     } else {
7146       Opc = AArch64::MSUBXrrr;
7147       RC = &AArch64::GPR64RegClass;
7148     }
7149     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7150     break;
7151   case AArch64MachineCombinerPattern::MULSUBWI_OP1:
7152   case AArch64MachineCombinerPattern::MULSUBXI_OP1: {
7153     // MUL I=A,B,0
7154     // SUB R,I, Imm
7155     // ==> MOV  V, -Imm
7156     // ==> MADD R,A,B,V // = -Imm + A*B
7157     // --- Create(MADD);
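    // Illustrative example (hypothetical registers and values):
    //   mul  w8, w0, w1
    //   sub  w0, w8, #42
    //   ==> mov  w9, #-42
    //   ==> madd w0, w0, w1, w9  // = A*B - 42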
7158     const TargetRegisterClass *OrrRC;
7159     unsigned BitSize, OrrOpc, ZeroReg;
7160     if (Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1) {
7161       OrrOpc = AArch64::ORRWri;
7162       OrrRC = &AArch64::GPR32spRegClass;
7163       BitSize = 32;
7164       ZeroReg = AArch64::WZR;
7165       Opc = AArch64::MADDWrrr;
7166       RC = &AArch64::GPR32RegClass;
7167     } else {
7168       OrrOpc = AArch64::ORRXri;
7169       OrrRC = &AArch64::GPR64spRegClass;
7170       BitSize = 64;
7171       ZeroReg = AArch64::XZR;
7172       Opc = AArch64::MADDXrrr;
7173       RC = &AArch64::GPR64RegClass;
7174     }
7175     Register NewVR = MRI.createVirtualRegister(OrrRC);
7176     uint64_t Imm = Root.getOperand(2).getImm();
7177     if (Root.getOperand(3).isImm()) {
7178       unsigned Val = Root.getOperand(3).getImm();
7179       Imm = Imm << Val;
7180     }
7181     uint64_t UImm = SignExtend64(-Imm, BitSize);
7182     // The immediate can be composed via a single instruction.
7183     SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
7184     AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
7185     if (Insn.size() != 1)
7186       return;
7187     auto MovI = Insn.begin();
7188     MachineInstrBuilder MIB1;
7189     // MOV is an alias for one of three instructions: movz, movn, and orr.
7190     if (MovI->Opcode == OrrOpc)
7191       MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR)
7192                  .addReg(ZeroReg)
7193                  .addImm(MovI->Op2);
7194     else {
7195       if (BitSize == 32)
7196         assert((MovI->Opcode == AArch64::MOVNWi ||
7197                 MovI->Opcode == AArch64::MOVZWi) &&
7198                "Expected opcode");
7199       else
7200         assert((MovI->Opcode == AArch64::MOVNXi ||
7201                 MovI->Opcode == AArch64::MOVZXi) &&
7202                "Expected opcode");
7203       MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR)
7204                  .addImm(MovI->Op1)
7205                  .addImm(MovI->Op2);
7206     }
7207     InsInstrs.push_back(MIB1);
7208     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7209     MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7210     break;
7211   }
7212 
7213   case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
7214     Opc = AArch64::MLAv8i8;
7215     RC = &AArch64::FPR64RegClass;
7216     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7217     break;
7218   case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
7219     Opc = AArch64::MLAv8i8;
7220     RC = &AArch64::FPR64RegClass;
7221     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7222     break;
7223   case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
7224     Opc = AArch64::MLAv16i8;
7225     RC = &AArch64::FPR128RegClass;
7226     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7227     break;
7228   case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
7229     Opc = AArch64::MLAv16i8;
7230     RC = &AArch64::FPR128RegClass;
7231     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7232     break;
7233   case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
7234     Opc = AArch64::MLAv4i16;
7235     RC = &AArch64::FPR64RegClass;
7236     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7237     break;
7238   case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
7239     Opc = AArch64::MLAv4i16;
7240     RC = &AArch64::FPR64RegClass;
7241     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7242     break;
7243   case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
7244     Opc = AArch64::MLAv8i16;
7245     RC = &AArch64::FPR128RegClass;
7246     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7247     break;
7248   case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
7249     Opc = AArch64::MLAv8i16;
7250     RC = &AArch64::FPR128RegClass;
7251     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7252     break;
7253   case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
7254     Opc = AArch64::MLAv2i32;
7255     RC = &AArch64::FPR64RegClass;
7256     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7257     break;
7258   case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
7259     Opc = AArch64::MLAv2i32;
7260     RC = &AArch64::FPR64RegClass;
7261     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7262     break;
7263   case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
7264     Opc = AArch64::MLAv4i32;
7265     RC = &AArch64::FPR128RegClass;
7266     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7267     break;
7268   case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
7269     Opc = AArch64::MLAv4i32;
7270     RC = &AArch64::FPR128RegClass;
7271     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7272     break;
7273 
7274   case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
7275     Opc = AArch64::MLAv8i8;
7276     RC = &AArch64::FPR64RegClass;
7277     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7278                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
7279                                  RC);
7280     break;
7281   case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
7282     Opc = AArch64::MLSv8i8;
7283     RC = &AArch64::FPR64RegClass;
7284     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7285     break;
7286   case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
7287     Opc = AArch64::MLAv16i8;
7288     RC = &AArch64::FPR128RegClass;
7289     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7290                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
7291                                  RC);
7292     break;
7293   case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
7294     Opc = AArch64::MLSv16i8;
7295     RC = &AArch64::FPR128RegClass;
7296     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7297     break;
7298   case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
7299     Opc = AArch64::MLAv4i16;
7300     RC = &AArch64::FPR64RegClass;
7301     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7302                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
7303                                  RC);
7304     break;
7305   case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
7306     Opc = AArch64::MLSv4i16;
7307     RC = &AArch64::FPR64RegClass;
7308     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7309     break;
7310   case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
7311     Opc = AArch64::MLAv8i16;
7312     RC = &AArch64::FPR128RegClass;
7313     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7314                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
7315                                  RC);
7316     break;
7317   case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
7318     Opc = AArch64::MLSv8i16;
7319     RC = &AArch64::FPR128RegClass;
7320     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7321     break;
7322   case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
7323     Opc = AArch64::MLAv2i32;
7324     RC = &AArch64::FPR64RegClass;
7325     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7326                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
7327                                  RC);
7328     break;
7329   case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
7330     Opc = AArch64::MLSv2i32;
7331     RC = &AArch64::FPR64RegClass;
7332     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7333     break;
7334   case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
7335     Opc = AArch64::MLAv4i32;
7336     RC = &AArch64::FPR128RegClass;
7337     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7338                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
7339                                  RC);
7340     break;
7341   case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
7342     Opc = AArch64::MLSv4i32;
7343     RC = &AArch64::FPR128RegClass;
7344     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7345     break;
7346 
7347   case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
7348     Opc = AArch64::MLAv4i16_indexed;
7349     RC = &AArch64::FPR64RegClass;
7350     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7351     break;
7352   case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
7353     Opc = AArch64::MLAv4i16_indexed;
7354     RC = &AArch64::FPR64RegClass;
7355     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7356     break;
7357   case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
7358     Opc = AArch64::MLAv8i16_indexed;
7359     RC = &AArch64::FPR128RegClass;
7360     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7361     break;
7362   case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
7363     Opc = AArch64::MLAv8i16_indexed;
7364     RC = &AArch64::FPR128RegClass;
7365     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7366     break;
7367   case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
7368     Opc = AArch64::MLAv2i32_indexed;
7369     RC = &AArch64::FPR64RegClass;
7370     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7371     break;
7372   case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
7373     Opc = AArch64::MLAv2i32_indexed;
7374     RC = &AArch64::FPR64RegClass;
7375     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7376     break;
7377   case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
7378     Opc = AArch64::MLAv4i32_indexed;
7379     RC = &AArch64::FPR128RegClass;
7380     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7381     break;
7382   case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
7383     Opc = AArch64::MLAv4i32_indexed;
7384     RC = &AArch64::FPR128RegClass;
7385     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7386     break;
7387 
7388   case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
7389     Opc = AArch64::MLAv4i16_indexed;
7390     RC = &AArch64::FPR64RegClass;
7391     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7392                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
7393                                  RC);
7394     break;
7395   case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
7396     Opc = AArch64::MLSv4i16_indexed;
7397     RC = &AArch64::FPR64RegClass;
7398     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7399     break;
7400   case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
7401     Opc = AArch64::MLAv8i16_indexed;
7402     RC = &AArch64::FPR128RegClass;
7403     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7404                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
7405                                  RC);
7406     break;
7407   case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
7408     Opc = AArch64::MLSv8i16_indexed;
7409     RC = &AArch64::FPR128RegClass;
7410     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7411     break;
7412   case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
7413     Opc = AArch64::MLAv2i32_indexed;
7414     RC = &AArch64::FPR64RegClass;
7415     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7416                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
7417                                  RC);
7418     break;
7419   case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
7420     Opc = AArch64::MLSv2i32_indexed;
7421     RC = &AArch64::FPR64RegClass;
7422     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7423     break;
7424   case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
7425     Opc = AArch64::MLAv4i32_indexed;
7426     RC = &AArch64::FPR128RegClass;
7427     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7428                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
7429                                  RC);
7430     break;
7431   case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
7432     Opc = AArch64::MLSv4i32_indexed;
7433     RC = &AArch64::FPR128RegClass;
7434     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7435     break;
7436 
7437   // Floating Point Support
7438   case AArch64MachineCombinerPattern::FMULADDH_OP1:
7439     Opc = AArch64::FMADDHrrr;
7440     RC = &AArch64::FPR16RegClass;
7441     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7442     break;
7443   case AArch64MachineCombinerPattern::FMULADDS_OP1:
7444     Opc = AArch64::FMADDSrrr;
7445     RC = &AArch64::FPR32RegClass;
7446     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7447     break;
7448   case AArch64MachineCombinerPattern::FMULADDD_OP1:
7449     Opc = AArch64::FMADDDrrr;
7450     RC = &AArch64::FPR64RegClass;
7451     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7452     break;
7453 
7454   case AArch64MachineCombinerPattern::FMULADDH_OP2:
7455     Opc = AArch64::FMADDHrrr;
7456     RC = &AArch64::FPR16RegClass;
7457     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7458     break;
7459   case AArch64MachineCombinerPattern::FMULADDS_OP2:
7460     Opc = AArch64::FMADDSrrr;
7461     RC = &AArch64::FPR32RegClass;
7462     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7463     break;
7464   case AArch64MachineCombinerPattern::FMULADDD_OP2:
7465     Opc = AArch64::FMADDDrrr;
7466     RC = &AArch64::FPR64RegClass;
7467     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7468     break;
7469 
7470   case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
7471     Opc = AArch64::FMLAv1i32_indexed;
7472     RC = &AArch64::FPR32RegClass;
7473     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7474                            FMAInstKind::Indexed);
7475     break;
7476   case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
7477     Opc = AArch64::FMLAv1i32_indexed;
7478     RC = &AArch64::FPR32RegClass;
7479     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7480                            FMAInstKind::Indexed);
7481     break;
7482 
7483   case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
7484     Opc = AArch64::FMLAv1i64_indexed;
7485     RC = &AArch64::FPR64RegClass;
7486     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7487                            FMAInstKind::Indexed);
7488     break;
7489   case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
7490     Opc = AArch64::FMLAv1i64_indexed;
7491     RC = &AArch64::FPR64RegClass;
7492     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7493                            FMAInstKind::Indexed);
7494     break;
7495 
7496   case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
7497     RC = &AArch64::FPR64RegClass;
7498     Opc = AArch64::FMLAv4i16_indexed;
7499     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7500                            FMAInstKind::Indexed);
7501     break;
7502   case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
7503     RC = &AArch64::FPR64RegClass;
7504     Opc = AArch64::FMLAv4f16;
7505     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7506                            FMAInstKind::Accumulator);
7507     break;
7508   case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
7509     RC = &AArch64::FPR64RegClass;
7510     Opc = AArch64::FMLAv4i16_indexed;
7511     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7512                            FMAInstKind::Indexed);
7513     break;
7514   case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
7515     RC = &AArch64::FPR64RegClass;
7516     Opc = AArch64::FMLAv4f16;
7517     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7518                            FMAInstKind::Accumulator);
7519     break;
7520 
7521   case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
7522   case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
7523     RC = &AArch64::FPR64RegClass;
7524     if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
7525       Opc = AArch64::FMLAv2i32_indexed;
7526       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7527                              FMAInstKind::Indexed);
7528     } else {
7529       Opc = AArch64::FMLAv2f32;
7530       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7531                              FMAInstKind::Accumulator);
7532     }
7533     break;
7534   case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
7535   case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
7536     RC = &AArch64::FPR64RegClass;
7537     if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
7538       Opc = AArch64::FMLAv2i32_indexed;
7539       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7540                              FMAInstKind::Indexed);
7541     } else {
7542       Opc = AArch64::FMLAv2f32;
7543       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7544                              FMAInstKind::Accumulator);
7545     }
7546     break;
7547 
7548   case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
7549     RC = &AArch64::FPR128RegClass;
7550     Opc = AArch64::FMLAv8i16_indexed;
7551     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7552                            FMAInstKind::Indexed);
7553     break;
7554   case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
7555     RC = &AArch64::FPR128RegClass;
7556     Opc = AArch64::FMLAv8f16;
7557     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7558                            FMAInstKind::Accumulator);
7559     break;
7560   case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
7561     RC = &AArch64::FPR128RegClass;
7562     Opc = AArch64::FMLAv8i16_indexed;
7563     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7564                            FMAInstKind::Indexed);
7565     break;
7566   case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
7567     RC = &AArch64::FPR128RegClass;
7568     Opc = AArch64::FMLAv8f16;
7569     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7570                            FMAInstKind::Accumulator);
7571     break;
7572 
7573   case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
7574   case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
7575     RC = &AArch64::FPR128RegClass;
7576     if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
7577       Opc = AArch64::FMLAv2i64_indexed;
7578       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7579                              FMAInstKind::Indexed);
7580     } else {
7581       Opc = AArch64::FMLAv2f64;
7582       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7583                              FMAInstKind::Accumulator);
7584     }
7585     break;
7586   case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
7587   case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
7588     RC = &AArch64::FPR128RegClass;
7589     if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
7590       Opc = AArch64::FMLAv2i64_indexed;
7591       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7592                              FMAInstKind::Indexed);
7593     } else {
7594       Opc = AArch64::FMLAv2f64;
7595       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7596                              FMAInstKind::Accumulator);
7597     }
7598     break;
7599 
7600   case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
7601   case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
7602     RC = &AArch64::FPR128RegClass;
7603     if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
7604       Opc = AArch64::FMLAv4i32_indexed;
7605       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7606                              FMAInstKind::Indexed);
7607     } else {
7608       Opc = AArch64::FMLAv4f32;
7609       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7610                              FMAInstKind::Accumulator);
7611     }
7612     break;
7613 
7614   case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
7615   case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
7616     RC = &AArch64::FPR128RegClass;
7617     if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
7618       Opc = AArch64::FMLAv4i32_indexed;
7619       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7620                              FMAInstKind::Indexed);
7621     } else {
7622       Opc = AArch64::FMLAv4f32;
7623       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7624                              FMAInstKind::Accumulator);
7625     }
7626     break;
7627 
7628   case AArch64MachineCombinerPattern::FMULSUBH_OP1:
7629     Opc = AArch64::FNMSUBHrrr;
7630     RC = &AArch64::FPR16RegClass;
7631     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7632     break;
7633   case AArch64MachineCombinerPattern::FMULSUBS_OP1:
7634     Opc = AArch64::FNMSUBSrrr;
7635     RC = &AArch64::FPR32RegClass;
7636     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7637     break;
7638   case AArch64MachineCombinerPattern::FMULSUBD_OP1:
7639     Opc = AArch64::FNMSUBDrrr;
7640     RC = &AArch64::FPR64RegClass;
7641     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7642     break;
7643 
7644   case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
7645     Opc = AArch64::FNMADDHrrr;
7646     RC = &AArch64::FPR16RegClass;
7647     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7648     break;
7649   case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
7650     Opc = AArch64::FNMADDSrrr;
7651     RC = &AArch64::FPR32RegClass;
7652     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7653     break;
7654   case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
7655     Opc = AArch64::FNMADDDrrr;
7656     RC = &AArch64::FPR64RegClass;
7657     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7658     break;
7659 
7660   case AArch64MachineCombinerPattern::FMULSUBH_OP2:
7661     Opc = AArch64::FMSUBHrrr;
7662     RC = &AArch64::FPR16RegClass;
7663     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7664     break;
7665   case AArch64MachineCombinerPattern::FMULSUBS_OP2:
7666     Opc = AArch64::FMSUBSrrr;
7667     RC = &AArch64::FPR32RegClass;
7668     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7669     break;
7670   case AArch64MachineCombinerPattern::FMULSUBD_OP2:
7671     Opc = AArch64::FMSUBDrrr;
7672     RC = &AArch64::FPR64RegClass;
7673     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7674     break;
7675 
7676   case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
7677     Opc = AArch64::FMLSv1i32_indexed;
7678     RC = &AArch64::FPR32RegClass;
7679     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7680                            FMAInstKind::Indexed);
7681     break;
7682 
7683   case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
7684     Opc = AArch64::FMLSv1i64_indexed;
7685     RC = &AArch64::FPR64RegClass;
7686     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7687                            FMAInstKind::Indexed);
7688     break;
7689 
7690   case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
7691   case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
7692     RC = &AArch64::FPR64RegClass;
7693     Register NewVR = MRI.createVirtualRegister(RC);
7694     MachineInstrBuilder MIB1 =
7695         BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR)
7696             .add(Root.getOperand(2));
7697     InsInstrs.push_back(MIB1);
7698     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7699     if (Pattern == AArch64MachineCombinerPattern::FMLSv4f16_OP1) {
7700       Opc = AArch64::FMLAv4f16;
7701       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7702                              FMAInstKind::Accumulator, &NewVR);
7703     } else {
7704       Opc = AArch64::FMLAv4i16_indexed;
7705       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7706                              FMAInstKind::Indexed, &NewVR);
7707     }
7708     break;
7709   }
7710   case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
7711     RC = &AArch64::FPR64RegClass;
7712     Opc = AArch64::FMLSv4f16;
7713     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7714                            FMAInstKind::Accumulator);
7715     break;
7716   case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
7717     RC = &AArch64::FPR64RegClass;
7718     Opc = AArch64::FMLSv4i16_indexed;
7719     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7720                            FMAInstKind::Indexed);
7721     break;
7722 
7723   case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
7724   case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
7725     RC = &AArch64::FPR64RegClass;
7726     if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
7727       Opc = AArch64::FMLSv2i32_indexed;
7728       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7729                              FMAInstKind::Indexed);
7730     } else {
7731       Opc = AArch64::FMLSv2f32;
7732       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7733                              FMAInstKind::Accumulator);
7734     }
7735     break;
7736 
7737   case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
7738   case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
7739     RC = &AArch64::FPR128RegClass;
7740     Register NewVR = MRI.createVirtualRegister(RC);
7741     MachineInstrBuilder MIB1 =
7742         BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR)
7743             .add(Root.getOperand(2));
7744     InsInstrs.push_back(MIB1);
7745     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7746     if (Pattern == AArch64MachineCombinerPattern::FMLSv8f16_OP1) {
7747       Opc = AArch64::FMLAv8f16;
7748       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7749                              FMAInstKind::Accumulator, &NewVR);
7750     } else {
7751       Opc = AArch64::FMLAv8i16_indexed;
7752       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7753                              FMAInstKind::Indexed, &NewVR);
7754     }
7755     break;
7756   }
7757   case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
7758     RC = &AArch64::FPR128RegClass;
7759     Opc = AArch64::FMLSv8f16;
7760     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7761                            FMAInstKind::Accumulator);
7762     break;
7763   case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
7764     RC = &AArch64::FPR128RegClass;
7765     Opc = AArch64::FMLSv8i16_indexed;
7766     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7767                            FMAInstKind::Indexed);
7768     break;
7769 
7770   case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
7771   case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
7772     RC = &AArch64::FPR128RegClass;
7773     if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
7774       Opc = AArch64::FMLSv2i64_indexed;
7775       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7776                              FMAInstKind::Indexed);
7777     } else {
7778       Opc = AArch64::FMLSv2f64;
7779       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7780                              FMAInstKind::Accumulator);
7781     }
7782     break;
7783 
7784   case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
7785   case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
7786     RC = &AArch64::FPR128RegClass;
7787     if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
7788       Opc = AArch64::FMLSv4i32_indexed;
7789       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7790                              FMAInstKind::Indexed);
7791     } else {
7792       Opc = AArch64::FMLSv4f32;
7793       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7794                              FMAInstKind::Accumulator);
7795     }
7796     break;
7797   case AArch64MachineCombinerPattern::FMLSv2f32_OP1:
7798   case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
7799     RC = &AArch64::FPR64RegClass;
7800     Register NewVR = MRI.createVirtualRegister(RC);
7801     MachineInstrBuilder MIB1 =
7802         BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR)
7803             .add(Root.getOperand(2));
7804     InsInstrs.push_back(MIB1);
7805     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7806     if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
7807       Opc = AArch64::FMLAv2i32_indexed;
7808       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7809                              FMAInstKind::Indexed, &NewVR);
7810     } else {
7811       Opc = AArch64::FMLAv2f32;
7812       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7813                              FMAInstKind::Accumulator, &NewVR);
7814     }
7815     break;
7816   }
7817   case AArch64MachineCombinerPattern::FMLSv4f32_OP1:
7818   case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
7819     RC = &AArch64::FPR128RegClass;
7820     Register NewVR = MRI.createVirtualRegister(RC);
7821     MachineInstrBuilder MIB1 =
7822         BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR)
7823             .add(Root.getOperand(2));
7824     InsInstrs.push_back(MIB1);
7825     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7826     if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
7827       Opc = AArch64::FMLAv4i32_indexed;
7828       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7829                              FMAInstKind::Indexed, &NewVR);
7830     } else {
7831       Opc = AArch64::FMLAv4f32;
7832       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7833                              FMAInstKind::Accumulator, &NewVR);
7834     }
7835     break;
7836   }
7837   case AArch64MachineCombinerPattern::FMLSv2f64_OP1:
7838   case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
7839     RC = &AArch64::FPR128RegClass;
7840     Register NewVR = MRI.createVirtualRegister(RC);
7841     MachineInstrBuilder MIB1 =
7842         BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR)
7843             .add(Root.getOperand(2));
7844     InsInstrs.push_back(MIB1);
7845     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7846     if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
7847       Opc = AArch64::FMLAv2i64_indexed;
7848       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7849                              FMAInstKind::Indexed, &NewVR);
7850     } else {
7851       Opc = AArch64::FMLAv2f64;
7852       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7853                              FMAInstKind::Accumulator, &NewVR);
7854     }
7855     break;
7856   }
7857   case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
7858   case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2: {
7859     unsigned IdxDupOp =
7860         (Pattern == AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1
7861                                                                           : 2;
7862     genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
7863                        &AArch64::FPR128RegClass, MRI);
7864     break;
7865   }
7866   case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
7867   case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2: {
7868     unsigned IdxDupOp =
7869         (Pattern == AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1
7870                                                                           : 2;
7871     genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
7872                        &AArch64::FPR128RegClass, MRI);
7873     break;
7874   }
7875   case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
7876   case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2: {
7877     unsigned IdxDupOp =
7878         (Pattern == AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1
7879                                                                           : 2;
7880     genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
7881                        &AArch64::FPR128_loRegClass, MRI);
7882     break;
7883   }
7884   case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
7885   case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2: {
7886     unsigned IdxDupOp =
7887         (Pattern == AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1
7888                                                                           : 2;
7889     genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
7890                        &AArch64::FPR128RegClass, MRI);
7891     break;
7892   }
7893   case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
7894   case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2: {
7895     unsigned IdxDupOp =
7896         (Pattern == AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1
7897                                                                           : 2;
7898     genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
7899                        &AArch64::FPR128_loRegClass, MRI);
7900     break;
7901   }
7902   case AArch64MachineCombinerPattern::FNMADD: {
7903     MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
7904     break;
7905   }
7906 
7907   } // end switch (Pattern)
7908   // Record MUL and ADD/SUB for deletion
7909   if (MUL)
7910     DelInstrs.push_back(MUL);
7911   DelInstrs.push_back(&Root);
7912 
7913   // Set the flags on the inserted instructions to be the merged flags of the
7914   // instructions that we have combined.
7915   uint32_t Flags = Root.getFlags();
7916   if (MUL)
7917     Flags = Root.mergeFlagsWith(*MUL);
7918   for (auto *MI : InsInstrs)
7919     MI->setFlags(Flags);
7920 }
7921 
7922 /// Replace csincr-branch sequence by simple conditional branch
7923 ///
7924 /// Examples:
7925 /// 1. \code
7926 ///   csinc  w9, wzr, wzr, <condition code>
7927 ///   tbnz   w9, #0, 0x44
7928 ///    \endcode
7929 /// to
7930 ///    \code
7931 ///   b.<inverted condition code>
7932 ///    \endcode
7933 ///
7934 /// 2. \code
7935 ///   csinc w9, wzr, wzr, <condition code>
7936 ///   tbz   w9, #0, 0x44
7937 ///    \endcode
7938 /// to
7939 ///    \code
7940 ///   b.<condition code>
7941 ///    \endcode
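///
/// 3. (derived from the CBZ/CBNZ handling below) \code
///   csinc w9, wzr, wzr, <condition code>
///   cbz   w9, 0x44
///    \endcode
/// to
///    \code
///   b.<condition code>
///    \endcode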
7942 ///
7943 /// Replace compare and branch sequence by TBZ/TBNZ instruction when the
7944 /// compare's constant operand is power of 2.
7945 ///
7946 /// Examples:
7947 ///    \code
7948 ///   and  w8, w8, #0x400
7949 ///   cbnz w8, L1
7950 ///    \endcode
7951 /// to
7952 ///    \code
7953 ///   tbnz w8, #10, L1
7954 ///    \endcode
7955 ///
7956 /// \param  MI Conditional Branch
7957 /// \return True when the simple conditional branch is generated
7958 ///
7959 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
7960   bool IsNegativeBranch = false;
7961   bool IsTestAndBranch = false;
7962   unsigned TargetBBInMI = 0;
7963   switch (MI.getOpcode()) {
7964   default:
7965     llvm_unreachable("Unknown branch instruction?");
7966   case AArch64::Bcc:
7967     return false;
7968   case AArch64::CBZW:
7969   case AArch64::CBZX:
7970     TargetBBInMI = 1;
7971     break;
7972   case AArch64::CBNZW:
7973   case AArch64::CBNZX:
7974     TargetBBInMI = 1;
7975     IsNegativeBranch = true;
7976     break;
7977   case AArch64::TBZW:
7978   case AArch64::TBZX:
7979     TargetBBInMI = 2;
7980     IsTestAndBranch = true;
7981     break;
7982   case AArch64::TBNZW:
7983   case AArch64::TBNZX:
7984     TargetBBInMI = 2;
7985     IsNegativeBranch = true;
7986     IsTestAndBranch = true;
7987     break;
7988   }
7989   // So we increment a zero register and test for bits other
7990   // than bit 0? Conservatively bail out in case the verifier
7991   // missed this case.
7992   if (IsTestAndBranch && MI.getOperand(1).getImm())
7993     return false;
7994 
7995   // Find Definition.
7996   assert(MI.getParent() && "Incomplete machine instruction\n");
7997   MachineBasicBlock *MBB = MI.getParent();
7998   MachineFunction *MF = MBB->getParent();
7999   MachineRegisterInfo *MRI = &MF->getRegInfo();
8000   Register VReg = MI.getOperand(0).getReg();
8001   if (!VReg.isVirtual())
8002     return false;
8003 
8004   MachineInstr *DefMI = MRI->getVRegDef(VReg);
8005 
8006   // Look through COPY instructions to find definition.
8007   while (DefMI->isCopy()) {
8008     Register CopyVReg = DefMI->getOperand(1).getReg();
8009     if (!MRI->hasOneNonDBGUse(CopyVReg))
8010       return false;
8011     if (!MRI->hasOneDef(CopyVReg))
8012       return false;
8013     DefMI = MRI->getVRegDef(CopyVReg);
8014   }
8015 
8016   switch (DefMI->getOpcode()) {
8017   default:
8018     return false;
8019   // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
8020   case AArch64::ANDWri:
8021   case AArch64::ANDXri: {
8022     if (IsTestAndBranch)
8023       return false;
8024     if (DefMI->getParent() != MBB)
8025       return false;
8026     if (!MRI->hasOneNonDBGUse(VReg))
8027       return false;
8028 
8029     bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
8030     uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
8031         DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
8032     if (!isPowerOf2_64(Mask))
8033       return false;
8034 
8035     MachineOperand &MO = DefMI->getOperand(1);
8036     Register NewReg = MO.getReg();
8037     if (!NewReg.isVirtual())
8038       return false;
8039 
8040     assert(!MRI->def_empty(NewReg) && "Register must be defined.");
8041 
8042     MachineBasicBlock &RefToMBB = *MBB;
8043     MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
8044     DebugLoc DL = MI.getDebugLoc();
8045     unsigned Imm = Log2_64(Mask);
8046     unsigned Opc = (Imm < 32)
8047                        ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
8048                        : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
8049     MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
8050                               .addReg(NewReg)
8051                               .addImm(Imm)
8052                               .addMBB(TBB);
8053     // Register lives on to the TBZ/TBNZ now.
8054     MO.setIsKill(false);
8055 
8056     // For bit positions smaller than 32, we need to use the 32-bit
8057     // variant (W) in all cases, because the 64-bit variant cannot
8058     // encode them.
8059     // Therefore, if the input register is 64-bit, we need to take its
8060     // 32-bit sub-register.
8061     if (!Is32Bit && Imm < 32)
8062       NewMI->getOperand(0).setSubReg(AArch64::sub_32);
8063     MI.eraseFromParent();
8064     return true;
8065   }
8066   // Look for CSINC
8067   case AArch64::CSINCWr:
8068   case AArch64::CSINCXr: {
8069     if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
8070           DefMI->getOperand(2).getReg() == AArch64::WZR) &&
8071         !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
8072           DefMI->getOperand(2).getReg() == AArch64::XZR))
8073       return false;
8074 
8075     if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
8076                                          true) != -1)
8077       return false;
8078 
8079     AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
8080     // Convert only when the condition code is not modified between
8081     // the CSINC and the branch. The CC may be used by other
8082     // instructions in between.
8083     if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
8084       return false;
8085     MachineBasicBlock &RefToMBB = *MBB;
8086     MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
8087     DebugLoc DL = MI.getDebugLoc();
8088     if (IsNegativeBranch)
8089       CC = AArch64CC::getInvertedCondCode(CC);
8090     BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
8091     MI.eraseFromParent();
8092     return true;
8093   }
8094   }
8095 }
8096 
8097 std::pair<unsigned, unsigned>
8098 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
8099   const unsigned Mask = AArch64II::MO_FRAGMENT;
8100   return std::make_pair(TF & Mask, TF & ~Mask);
8101 }
8102 
8103 ArrayRef<std::pair<unsigned, const char *>>
8104 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
8105   using namespace AArch64II;
8106 
8107   static const std::pair<unsigned, const char *> TargetFlags[] = {
8108       {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
8109       {MO_G3, "aarch64-g3"},     {MO_G2, "aarch64-g2"},
8110       {MO_G1, "aarch64-g1"},     {MO_G0, "aarch64-g0"},
8111       {MO_HI12, "aarch64-hi12"}};
8112   return ArrayRef(TargetFlags);
8113 }
8114 
8115 ArrayRef<std::pair<unsigned, const char *>>
8116 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
8117   using namespace AArch64II;
8118 
8119   static const std::pair<unsigned, const char *> TargetFlags[] = {
8120       {MO_COFFSTUB, "aarch64-coffstub"},
8121       {MO_GOT, "aarch64-got"},
8122       {MO_NC, "aarch64-nc"},
8123       {MO_S, "aarch64-s"},
8124       {MO_TLS, "aarch64-tls"},
8125       {MO_DLLIMPORT, "aarch64-dllimport"},
8126       {MO_PREL, "aarch64-prel"},
8127       {MO_TAGGED, "aarch64-tagged"},
8128       {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
8129   };
8130   return ArrayRef(TargetFlags);
8131 }
8132 
8133 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
8134 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
8135   static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8136       {{MOSuppressPair, "aarch64-suppress-pair"},
8137        {MOStridedAccess, "aarch64-strided-access"}};
8138   return ArrayRef(TargetFlags);
8139 }
8140 
8141 /// Constants defining how certain sequences should be outlined.
8142 /// This encompasses how an outlined function should be called, and what kind of
8143 /// frame should be emitted for that outlined function.
8144 ///
8145 /// \p MachineOutlinerDefault implies that the function should be called with
8146 /// a save and restore of LR to the stack.
8147 ///
8148 /// That is,
8149 ///
8150 /// I1     Save LR                    OUTLINED_FUNCTION:
8151 /// I2 --> BL OUTLINED_FUNCTION       I1
8152 /// I3     Restore LR                 I2
8153 ///                                   I3
8154 ///                                   RET
8155 ///
8156 /// * Call construction overhead: 3 (save + BL + restore)
8157 /// * Frame construction overhead: 1 (ret)
8158 /// * Requires stack fixups? Yes
8159 ///
8160 /// \p MachineOutlinerTailCall implies that the function is being created from
8161 /// a sequence of instructions ending in a return.
8162 ///
8163 /// That is,
8164 ///
8165 /// I1                             OUTLINED_FUNCTION:
8166 /// I2 --> B OUTLINED_FUNCTION     I1
8167 /// RET                            I2
8168 ///                                RET
8169 ///
8170 /// * Call construction overhead: 1 (B)
8171 /// * Frame construction overhead: 0 (Return included in sequence)
8172 /// * Requires stack fixups? No
8173 ///
8174 /// \p MachineOutlinerNoLRSave implies that the function should be called using
8175 /// a BL instruction, but doesn't require LR to be saved and restored. This
8176 /// happens when LR is known to be dead.
8177 ///
8178 /// That is,
8179 ///
8180 /// I1                                OUTLINED_FUNCTION:
8181 /// I2 --> BL OUTLINED_FUNCTION       I1
8182 /// I3                                I2
8183 ///                                   I3
8184 ///                                   RET
8185 ///
8186 /// * Call construction overhead: 1 (BL)
8187 /// * Frame construction overhead: 1 (RET)
8188 /// * Requires stack fixups? No
8189 ///
8190 /// \p MachineOutlinerThunk implies that the function is being created from
8191 /// a sequence of instructions ending in a call. The outlined function is
8192 /// called with a BL instruction, and the outlined function tail-calls the
8193 /// original call destination.
8194 ///
8195 /// That is,
8196 ///
8197 /// I1                                OUTLINED_FUNCTION:
8198 /// I2 --> BL OUTLINED_FUNCTION       I1
8199 /// BL f                              I2
8200 ///                                   B f
8201 /// * Call construction overhead: 1 (BL)
8202 /// * Frame construction overhead: 0
8203 /// * Requires stack fixups? No
8204 ///
8205 /// \p MachineOutlinerRegSave implies that the function should be called with a
8206 /// save and restore of LR to an available register. This allows us to avoid
8207 /// stack fixups. Note that this outlining variant is compatible with the
8208 /// NoLRSave case.
8209 ///
8210 /// That is,
8211 ///
8212 /// I1     Save LR                    OUTLINED_FUNCTION:
8213 /// I2 --> BL OUTLINED_FUNCTION       I1
8214 /// I3     Restore LR                 I2
8215 ///                                   I3
8216 ///                                   RET
8217 ///
8218 /// * Call construction overhead: 3 (save + BL + restore)
8219 /// * Frame construction overhead: 1 (ret)
8220 /// * Requires stack fixups? No
8221 enum MachineOutlinerClass {
8222   MachineOutlinerDefault,  /// Emit a save, restore, call, and return.
8223   MachineOutlinerTailCall, /// Only emit a branch.
8224   MachineOutlinerNoLRSave, /// Emit a call and return.
8225   MachineOutlinerThunk,    /// Emit a call and tail-call.
8226   MachineOutlinerRegSave   /// Same as default, but save to a register.
8227 };
8228 
8229 enum MachineOutlinerMBBFlags {
8230   LRUnavailableSomewhere = 0x2,
8231   HasCalls = 0x4,
8232   UnsafeRegsDead = 0x8
8233 };
8234 
8235 Register
8236 AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
8237   MachineFunction *MF = C.getMF();
8238   const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
8239   const AArch64RegisterInfo *ARI =
8240       static_cast<const AArch64RegisterInfo *>(&TRI);
8241   // Check if there is an available register across the sequence that we can
8242   // use.
8243   for (unsigned Reg : AArch64::GPR64RegClass) {
8244     if (!ARI->isReservedReg(*MF, Reg) &&
8245         Reg != AArch64::LR &&  // LR is not reserved, but don't use it.
8246         Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
8247         Reg != AArch64::X17 && // Ditto for X17.
8248         C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
8249         C.isAvailableInsideSeq(Reg, TRI))
8250       return Reg;
8251   }
8252   return Register();
8253 }
8254 
8255 static bool
8256 outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
8257                                          const outliner::Candidate &b) {
8258   const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
8259   const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
8260 
8261   return MFIa->shouldSignReturnAddress(false) == MFIb->shouldSignReturnAddress(false) &&
8262          MFIa->shouldSignReturnAddress(true) == MFIb->shouldSignReturnAddress(true);
8263 }
8264 
8265 static bool
8266 outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
8267                                        const outliner::Candidate &b) {
8268   const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
8269   const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
8270 
8271   return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
8272 }
8273 
8274 static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
8275                                                 const outliner::Candidate &b) {
8276   const AArch64Subtarget &SubtargetA =
8277       a.getMF()->getSubtarget<AArch64Subtarget>();
8278   const AArch64Subtarget &SubtargetB =
8279       b.getMF()->getSubtarget<AArch64Subtarget>();
8280   return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
8281 }
8282 
8283 std::optional<outliner::OutlinedFunction>
8284 AArch64InstrInfo::getOutliningCandidateInfo(
8285     std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
8286   unsigned SequenceSize = 0;
8287   for (auto &MI : RepeatedSequenceLocs[0])
8288     SequenceSize += getInstSizeInBytes(MI);
8289 
8290   unsigned NumBytesToCreateFrame = 0;
8291 
8292   // We only allow outlining for functions having exactly matching return
8293   // address signing attributes, i.e., all share the same value for the
8294   // attribute "sign-return-address" and all share the same type of key they
8295   // are signed with.
8296   // Additionally, we require all functions to simultaneously either support
8297   // v8.3a features or not. Otherwise an outlined function could get signed
8298   // using dedicated v8.3 instructions and a call from a function that doesn't
8299   // support v8.3 instructions would therefore be invalid.
8300   if (std::adjacent_find(
8301           RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
8302           [](const outliner::Candidate &a, const outliner::Candidate &b) {
8303             // Return true if a and b are non-equal w.r.t. return address
8304             // signing or support of v8.3a features
8305             if (outliningCandidatesSigningScopeConsensus(a, b) &&
8306                 outliningCandidatesSigningKeyConsensus(a, b) &&
8307                 outliningCandidatesV8_3OpsConsensus(a, b)) {
8308               return false;
8309             }
8310             return true;
8311           }) != RepeatedSequenceLocs.end()) {
8312     return std::nullopt;
8313   }
8314 
8315   // Since at this point all candidates agree on their return address signing,
8316   // picking just one is fine. If the candidate functions potentially sign their
8317   // return addresses, the outlined function should do the same. Note that in
8318   // the case of "sign-return-address"="non-leaf" this is an assumption: It is
8319   // not certainly true that the outlined function will have to sign its return
8320   // address but this decision is made later, when the decision to outline
8321   // has already been made.
8322   // The same holds for the number of additional instructions we need: On
8323   // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
8324   // necessary. However, at this point we don't know if the outlined function
8325   // will have a RET instruction so we assume the worst.
8326   const TargetRegisterInfo &TRI = getRegisterInfo();
8327   // Performing a tail call may require extra checks when PAuth is enabled.
8328   // If PAuth is disabled, set it to zero for uniformity.
8329   unsigned NumBytesToCheckLRInTCEpilogue = 0;
8330   if (RepeatedSequenceLocs[0]
8331           .getMF()
8332           ->getInfo<AArch64FunctionInfo>()
8333           ->shouldSignReturnAddress(true)) {
8334     // One PAC and one AUT instruction.
8335     NumBytesToCreateFrame += 8;
8336 
8337     // PAuth is enabled - set extra tail call cost, if any.
8338     auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
8339         *RepeatedSequenceLocs[0].getMF());
8340     NumBytesToCheckLRInTCEpilogue =
8341         AArch64PAuth::getCheckerSizeInBytes(LRCheckMethod);
8342     // Checking the authenticated LR value may significantly impact
8343     // SequenceSize, so account for it for more precise results.
8344     if (isTailCallReturnInst(RepeatedSequenceLocs[0].back()))
8345       SequenceSize += NumBytesToCheckLRInTCEpilogue;
8346 
8347     // We have to check if sp-modifying instructions would get outlined.
8348     // If so, we only allow outlining if sp is unchanged overall, so matching
8349     // sub and add instructions are okay to outline; all other sp modifications
8350     // are not.
8351     auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
8352       int SPValue = 0;
8353       for (auto &MI : C) {
8354         if (MI.modifiesRegister(AArch64::SP, &TRI)) {
8355           switch (MI.getOpcode()) {
8356           case AArch64::ADDXri:
8357           case AArch64::ADDWri:
8358             assert(MI.getNumOperands() == 4 && "Wrong number of operands");
8359             assert(MI.getOperand(2).isImm() &&
8360                    "Expected operand to be immediate");
8361             assert(MI.getOperand(1).isReg() &&
8362                    "Expected operand to be a register");
8363             // Check if the add just increments sp. If so, we search for
8364             // matching sub instructions that decrement sp. If not, the
8365             // modification is illegal
8366             if (MI.getOperand(1).getReg() == AArch64::SP)
8367               SPValue += MI.getOperand(2).getImm();
8368             else
8369               return true;
8370             break;
8371           case AArch64::SUBXri:
8372           case AArch64::SUBWri:
8373             assert(MI.getNumOperands() == 4 && "Wrong number of operands");
8374             assert(MI.getOperand(2).isImm() &&
8375                    "Expected operand to be immediate");
8376             assert(MI.getOperand(1).isReg() &&
8377                    "Expected operand to be a register");
8378             // Check if the sub just decrements sp. If so, we search for
8379             // matching add instructions that increment sp. If not, the
8380             // modification is illegal
8381             if (MI.getOperand(1).getReg() == AArch64::SP)
8382               SPValue -= MI.getOperand(2).getImm();
8383             else
8384               return true;
8385             break;
8386           default:
8387             return true;
8388           }
8389         }
8390       }
8391       if (SPValue)
8392         return true;
8393       return false;
8394     };
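    // Illustrative sketch (not from the original source) of what the lambda
    // above accepts and rejects: a candidate containing the balanced pair
    //   sub sp, sp, #16 ... add sp, sp, #16
    // leaves SPValue at 0 and is kept, whereas a candidate containing only
    //   sub sp, sp, #16
    // (or any other SP write, e.g. a move into sp) nets a non-zero or unknown
    // SP change and is pruned by the erase_if below.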
8395     // Remove candidates with illegal stack modifying instructions
8396     llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
8397 
8398     // If the sequence doesn't have enough candidates left, then we're done.
8399     if (RepeatedSequenceLocs.size() < 2)
8400       return std::nullopt;
8401   }
8402 
8403   // Properties about candidate MBBs that hold for all of them.
8404   unsigned FlagsSetInAll = 0xF;
8405 
8406   // Compute liveness information for each candidate, and set FlagsSetInAll.
8407   for (outliner::Candidate &C : RepeatedSequenceLocs)
8408     FlagsSetInAll &= C.Flags;
8409 
8410   unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
8411 
8412   // Helper lambda which sets call information for every candidate.
8413   auto SetCandidateCallInfo =
8414       [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
8415         for (outliner::Candidate &C : RepeatedSequenceLocs)
8416           C.setCallInfo(CallID, NumBytesForCall);
8417       };
8418 
8419   unsigned FrameID = MachineOutlinerDefault;
8420   NumBytesToCreateFrame += 4;
8421 
8422   bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
8423     return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
8424   });
8425 
8426   // We check to see if CFI Instructions are present, and if they are
8427   // we find the number of CFI Instructions in the candidates.
8428   unsigned CFICount = 0;
8429   for (auto &I : RepeatedSequenceLocs[0]) {
8430     if (I.isCFIInstruction())
8431       CFICount++;
8432   }
8433 
8434   // We compare the number of CFI instructions found in the candidate to the
8435   // number of CFI instructions in the parent function for each candidate. We
8436   // must check this because if we outline one of the CFI instructions in a
8437   // function, we have to outline them all for correctness. If we do not, the
8438   // address offsets will be incorrect between the two sections of the program.
8439   for (outliner::Candidate &C : RepeatedSequenceLocs) {
8440     std::vector<MCCFIInstruction> CFIInstructions =
8441         C.getMF()->getFrameInstructions();
8442 
8443     if (CFICount > 0 && CFICount != CFIInstructions.size())
8444       return std::nullopt;
8445   }
8446 
8447   // Returns true if an instruction is safe to fix up, false otherwise.
8448   auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
8449     if (MI.isCall())
8450       return true;
8451 
8452     if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
8453         !MI.readsRegister(AArch64::SP, &TRI))
8454       return true;
8455 
8456     // Any modification of SP will break our code to save/restore LR.
8457     // FIXME: We could handle some instructions which add a constant
8458     // offset to SP, with a bit more work.
8459     if (MI.modifiesRegister(AArch64::SP, &TRI))
8460       return false;
8461 
8462     // At this point, we have a stack instruction that we might need to
8463     // fix up. We'll handle it if it's a load or store.
8464     if (MI.mayLoadOrStore()) {
8465       const MachineOperand *Base; // Filled with the base operand of MI.
8466       int64_t Offset;             // Filled with the offset of MI.
8467       bool OffsetIsScalable;
8468 
8469       // Does it allow us to offset the base operand and is the base the
8470       // register SP?
8471       if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
8472           !Base->isReg() || Base->getReg() != AArch64::SP)
8473         return false;
8474 
8475       // The fix-up code below assumes byte offsets.
8476       if (OffsetIsScalable)
8477         return false;
8478 
8479       // Find the minimum/maximum offset for this instruction and check
8480       // if fixing it up would be in range.
8481       int64_t MinOffset,
8482           MaxOffset;  // Unscaled offsets for the instruction.
8483       // The scale to multiply the offsets by.
8484       TypeSize Scale(0U, false), DummyWidth(0U, false);
8485       getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
8486 
8487       Offset += 16; // Update the offset to what it would be if we outlined.
8488       if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
8489           Offset > MaxOffset * (int64_t)Scale.getFixedValue())
8490         return false;
8491 
8492       // It's in range, so we can outline it.
8493       return true;
8494     }
8495 
8496     // FIXME: Add handling for instructions like "add x0, sp, #8".
8497 
8498     // We can't fix it up, so don't outline it.
8499     return false;
8500   };
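  // Worked example for IsSafeToFixup above (illustrative, assuming LDRXui has
  // Scale = 8 and an unsigned 12-bit immediate, i.e. MinOffset = 0 and
  // MaxOffset = 4095): an SP-relative load "ldr x0, [sp, #8]" has Offset = 8;
  // after outlining the byte offset becomes 8 + 16 = 24, which lies within
  // [0 * 8, 4095 * 8], so the instruction is considered safe to fix up.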
8501 
8502   // True if it's possible to fix up each stack instruction in this sequence.
8503   // Important for frames/call variants that modify the stack.
8504   bool AllStackInstrsSafe =
8505       llvm::all_of(RepeatedSequenceLocs[0], IsSafeToFixup);
8506 
8507   // If the last instruction in any candidate is a terminator, then we should
8508   // tail call all of the candidates.
8509   if (RepeatedSequenceLocs[0].back().isTerminator()) {
8510     FrameID = MachineOutlinerTailCall;
8511     NumBytesToCreateFrame = 0;
8512     unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
8513     SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
8514   }
8515 
8516   else if (LastInstrOpcode == AArch64::BL ||
8517            ((LastInstrOpcode == AArch64::BLR ||
8518              LastInstrOpcode == AArch64::BLRNoIP) &&
8519             !HasBTI)) {
8520     // FIXME: Do we need to check if the code after this uses the value of LR?
8521     FrameID = MachineOutlinerThunk;
8522     NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
8523     SetCandidateCallInfo(MachineOutlinerThunk, 4);
8524   }
8525 
8526   else {
8527     // We need to decide how to emit calls + frames. We can always emit the same
8528     // frame if we don't need to save to the stack. If we have to save to the
8529     // stack, then we need a different frame.
8530     unsigned NumBytesNoStackCalls = 0;
8531     std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
8532 
8533     // Check if we have to save LR.
8534     for (outliner::Candidate &C : RepeatedSequenceLocs) {
8535       bool LRAvailable =
8536           (C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere)
8537               ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)
8538               : true;
8539       // If we have a noreturn caller, then we're going to be conservative and
8540       // say that we have to save LR. If we don't have a ret at the end of the
8541       // block, then we can't reason about liveness accurately.
8542       //
8543       // FIXME: We can probably do better than always disabling this in
8544       // noreturn functions by fixing up the liveness info.
8545       bool IsNoReturn =
8546           C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
8547 
8548       // Is LR available? If so, we don't need a save.
8549       if (LRAvailable && !IsNoReturn) {
8550         NumBytesNoStackCalls += 4;
8551         C.setCallInfo(MachineOutlinerNoLRSave, 4);
8552         CandidatesWithoutStackFixups.push_back(C);
8553       }
8554 
8555       // Is an unused register available? If so, we won't modify the stack, so
8556       // we can outline with the same frame type as those that don't save LR.
8557       else if (findRegisterToSaveLRTo(C)) {
8558         NumBytesNoStackCalls += 12;
8559         C.setCallInfo(MachineOutlinerRegSave, 12);
8560         CandidatesWithoutStackFixups.push_back(C);
8561       }
8562 
8563       // Is SP used in the sequence at all? If not, we don't have to modify
8564       // the stack, so we are guaranteed to get the same frame.
8565       else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
8566         NumBytesNoStackCalls += 12;
8567         C.setCallInfo(MachineOutlinerDefault, 12);
8568         CandidatesWithoutStackFixups.push_back(C);
8569       }
8570 
8571       // If we outline this, we need to modify the stack. Pretend we don't
8572       // outline this by saving all of its bytes.
8573       else {
8574         NumBytesNoStackCalls += SequenceSize;
8575       }
8576     }
8577 
8578     // If there are no places where we have to save LR, then note that we
8579     // don't have to update the stack. Otherwise, give every candidate the
8580     // default call type, as long as it's safe to do so.
8581     if (!AllStackInstrsSafe ||
8582         NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
8583       RepeatedSequenceLocs = CandidatesWithoutStackFixups;
8584       FrameID = MachineOutlinerNoLRSave;
8585       if (RepeatedSequenceLocs.size() < 2)
8586         return std::nullopt;
8587     } else {
8588       SetCandidateCallInfo(MachineOutlinerDefault, 12);
8589 
8590       // Bugzilla ID: 46767
8591       // TODO: Check if fixing up the stack more than once is safe so we can
8592       // outline these.
8593       //
8594       // An outline resulting in a caller that requires stack fixups at the
8595       // callsite to a callee that also requires stack fixups can happen when
8596       // there are no available registers at the candidate callsite for a
8597       // candidate that itself also has calls.
8598       //
8599       // In other words if function_containing_sequence in the following pseudo
8600       // assembly requires that we save LR at the point of the call, but there
8601       // are no available registers: in this case we save using SP and as a
8602       //   result the SP offsets require stack fixups in multiples of 16.
8603       //
8604       // function_containing_sequence:
8605       //   ...
8606       //   save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
8607       //   call OUTLINED_FUNCTION_N
8608       //   restore LR from SP
8609       //   ...
8610       //
8611       // OUTLINED_FUNCTION_N:
8612       //   save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
8613       //   ...
8614       //   bl foo
8615       //   restore LR from SP
8616       //   ret
8617       //
8618       // Because the code to handle more than one stack fixup does not
8619       // currently have the proper checks for legality, these cases will assert
8620       // in the AArch64 MachineOutliner. This is because the code to do this
8621       // needs more hardening, testing, and better checks that the generated
8622       // code is legal, etc., and because it is only verified to handle a single
8623       // pass of stack fixup.
8624       //
8625       // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
8626       // these cases until they are known to be handled. Bugzilla 46767 is
8627       // referenced in comments at the assert site.
8628       //
8629       // To avoid asserting (or generating illegal code on no-assert builds),
8630       // we remove all candidates which would need more than one stack fixup by
8631       // pruning the cases where the candidate has calls while also having no
8632       // available LR and no available general-purpose registers to copy LR to
8633       // (i.e., one extra stack save/restore).
8634       //
8635       if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
8636         erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
8637           auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
8638           return (llvm::any_of(C, IsCall)) &&
8639                  (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
8640                   !findRegisterToSaveLRTo(C));
8641         });
8642       }
8643     }
8644 
8645     // If we dropped all of the candidates, bail out here.
8646     if (RepeatedSequenceLocs.size() < 2) {
8647       RepeatedSequenceLocs.clear();
8648       return std::nullopt;
8649     }
8650   }
8651 
8652   // Does every candidate's MBB contain a call? If so, then we might have a call
8653   // in the range.
8654   if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
8655     // Check if the range contains a call. These require a save + restore of the
8656     // link register.
8657     outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
8658     bool ModStackToSaveLR = false;
8659     if (std::any_of(FirstCand.begin(), std::prev(FirstCand.end()),
8660                     [](const MachineInstr &MI) { return MI.isCall(); }))
8661       ModStackToSaveLR = true;
8662 
8663     // Handle the last instruction separately. If this is a tail call, then the
8664     // last instruction is a call. We don't want to save + restore in this case.
8665     // However, it could be possible that the last instruction is a call without
8666     // it being valid to tail call this sequence. We should consider this as
8667     // well.
8668     else if (FrameID != MachineOutlinerThunk &&
8669              FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
8670       ModStackToSaveLR = true;
8671 
8672     if (ModStackToSaveLR) {
8673       // We can't fix up the stack. Bail out.
8674       if (!AllStackInstrsSafe) {
8675         RepeatedSequenceLocs.clear();
8676         return std::nullopt;
8677       }
8678 
8679       // Save + restore LR.
8680       NumBytesToCreateFrame += 8;
8681     }
8682   }
8683 
8684   // If we have CFI instructions, we can only outline if the outlined section
8685   // can be a tail call.
8686   if (FrameID != MachineOutlinerTailCall && CFICount > 0)
8687     return std::nullopt;
8688 
8689   return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
8690                                     NumBytesToCreateFrame, FrameID);
8691 }
8692 
8693 void AArch64InstrInfo::mergeOutliningCandidateAttributes(
8694     Function &F, std::vector<outliner::Candidate> &Candidates) const {
8695   // If a bunch of candidates reach this point, they must agree on their return
8696   // address signing. It is therefore enough to just consider the signing
8697   // behaviour of one of them.
8698   const auto &CFn = Candidates.front().getMF()->getFunction();
8699 
8700   if (CFn.hasFnAttribute("ptrauth-returns"))
8701     F.addFnAttr(CFn.getFnAttribute("ptrauth-returns"));
8702   if (CFn.hasFnAttribute("ptrauth-auth-traps"))
8703     F.addFnAttr(CFn.getFnAttribute("ptrauth-auth-traps"));
8704   // Since all candidates belong to the same module, just copy the
8705   // function-level attributes of an arbitrary function.
8706   if (CFn.hasFnAttribute("sign-return-address"))
8707     F.addFnAttr(CFn.getFnAttribute("sign-return-address"));
8708   if (CFn.hasFnAttribute("sign-return-address-key"))
8709     F.addFnAttr(CFn.getFnAttribute("sign-return-address-key"));
8710 
8711   AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
8712 }
8713 
8714 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
8715     MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
8716   const Function &F = MF.getFunction();
8717 
8718   // Can F be deduplicated by the linker? If it can, don't outline from it.
8719   if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
8720     return false;
8721 
8722   // Don't outline from functions with section markings; the program could
8723   // expect that all the code is in the named section.
8724   // FIXME: Allow outlining from multiple functions with the same section
8725   // marking.
8726   if (F.hasSection())
8727     return false;
8728 
8729   // Outlining from functions with redzones is unsafe since the outliner may
8730   // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
8731   // outline from it.
8732   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
8733   if (!AFI || AFI->hasRedZone().value_or(true))
8734     return false;
8735 
8736   // FIXME: Determine whether it is safe to outline from functions which contain
8737   // streaming-mode changes. We may need to ensure any smstart/smstop pairs are
8738   // outlined together and ensure it is safe to outline with async unwind info,
8739   // required for saving & restoring VG around calls.
8740   if (AFI->hasStreamingModeChanges())
8741     return false;
8742 
8743   // FIXME: Teach the outliner to generate/handle Windows unwind info.
8744   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
8745     return false;
8746 
8747   // It's safe to outline from MF.
8748   return true;
8749 }
8750 
8751 SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
8752 AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
8753                                       unsigned &Flags) const {
8754   assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
8755          "Must track liveness!");
8756   SmallVector<
8757       std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
8758       Ranges;
8759   // According to the AArch64 Procedure Call Standard, the following are
8760   // undefined on entry/exit from a function call:
8761   //
8762   // * Registers x16, x17, (and thus w16, w17)
8763   // * Condition codes (and thus the NZCV register)
8764   //
8765   // If any of these registers are used inside or live across an outlined
8766   // function, then they may be modified later, either by the compiler or
8767   // some other tool (like the linker).
8768   //
8769   // To avoid outlining in these situations, partition each block into ranges
8770   // where these registers are dead. We will only outline from those ranges.
8771   LiveRegUnits LRU(getRegisterInfo());
8772   auto AreAllUnsafeRegsDead = [&LRU]() {
8773     return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&
8774            LRU.available(AArch64::NZCV);
8775   };
8776 
8777   // We need to know if LR is live across an outlining boundary later on in
8778   // order to decide how we'll create the outlined call, frame, etc.
8779   //
8780   // It's pretty expensive to check this for *every candidate* within a block.
8781   // That's some potentially n^2 behaviour, since in the worst case, we'd need
8782   // to compute liveness from the end of the block for O(n) candidates within
8783   // the block.
8784   //
8785   // So, to improve the average case, let's keep track of liveness from the end
8786   // of the block to the beginning of *every outlinable range*. If we know that
8787   // LR is available in every range we could outline from, then we know that
8788   // we don't need to check liveness for any candidate within that range.
8789   bool LRAvailableEverywhere = true;
8790   // Compute liveness bottom-up.
8791   LRU.addLiveOuts(MBB);
8792   // Update flags that require info about the entire MBB.
8793   auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
8794     if (MI.isCall() && !MI.isTerminator())
8795       Flags |= MachineOutlinerMBBFlags::HasCalls;
8796   };
8797   // Range: [RangeBegin, RangeEnd)
8798   MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
8799   unsigned RangeLen;
8800   auto CreateNewRangeStartingAt =
8801       [&RangeBegin, &RangeEnd,
8802        &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
8803         RangeBegin = NewBegin;
8804         RangeEnd = std::next(RangeBegin);
8805         RangeLen = 0;
8806       };
8807   auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
8808     // At least one unsafe register is not dead. We do not want to outline at
8809     // this point. If it is long enough to outline from, save the range
8810     // [RangeBegin, RangeEnd).
8811     if (RangeLen > 1)
8812       Ranges.push_back(std::make_pair(RangeBegin, RangeEnd));
8813   };
8814   // Find the first point where all unsafe registers are dead.
8815   // FIND: <safe instr> <-- end of first potential range
8816   // SKIP: <unsafe def>
8817   // SKIP: ... everything between ...
8818   // SKIP: <unsafe use>
8819   auto FirstPossibleEndPt = MBB.instr_rbegin();
8820   for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
8821     LRU.stepBackward(*FirstPossibleEndPt);
8822     // Update flags that impact how we outline across the entire block,
8823     // regardless of safety.
8824     UpdateWholeMBBFlags(*FirstPossibleEndPt);
8825     if (AreAllUnsafeRegsDead())
8826       break;
8827   }
8828   // If we exhausted the entire block, we have no safe ranges to outline.
8829   if (FirstPossibleEndPt == MBB.instr_rend())
8830     return Ranges;
8831   // Current range.
8832   CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
8833   // StartPt points to the first place where all unsafe registers
8834   // are dead (if there is any such point). Begin partitioning the MBB into
8835   // ranges.
8836   for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) {
8837     LRU.stepBackward(MI);
8838     UpdateWholeMBBFlags(MI);
8839     if (!AreAllUnsafeRegsDead()) {
8840       SaveRangeIfNonEmpty();
8841       CreateNewRangeStartingAt(MI.getIterator());
8842       continue;
8843     }
8844     LRAvailableEverywhere &= LRU.available(AArch64::LR);
8845     RangeBegin = MI.getIterator();
8846     ++RangeLen;
8847   }
8848   // The above loop misses the last (or only) range. If we are still safe, then
8849   // save the range.
8850   if (AreAllUnsafeRegsDead())
8851     SaveRangeIfNonEmpty();
8852   if (Ranges.empty())
8853     return Ranges;
8854   // We found the ranges bottom-up, but the mapping expects them top-down, so
8855   // reverse the order.
8856   std::reverse(Ranges.begin(), Ranges.end());
8857   // If there is at least one outlinable range where LR is unavailable
8858   // somewhere, remember that.
8859   if (!LRAvailableEverywhere)
8860     Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
8861   return Ranges;
8862 }
8863 
8864 outliner::InstrType
8865 AArch64InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT,
8866                                    unsigned Flags) const {
8867   MachineInstr &MI = *MIT;
8868   MachineBasicBlock *MBB = MI.getParent();
8869   MachineFunction *MF = MBB->getParent();
8870   AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
8871 
8872   // Don't outline anything used for return address signing. The outlined
8873   // function will get signed later if needed.
8874   switch (MI.getOpcode()) {
8875   case AArch64::PACM:
8876   case AArch64::PACIASP:
8877   case AArch64::PACIBSP:
8878   case AArch64::PACIASPPC:
8879   case AArch64::PACIBSPPC:
8880   case AArch64::AUTIASP:
8881   case AArch64::AUTIBSP:
8882   case AArch64::AUTIASPPCi:
8883   case AArch64::AUTIASPPCr:
8884   case AArch64::AUTIBSPPCi:
8885   case AArch64::AUTIBSPPCr:
8886   case AArch64::RETAA:
8887   case AArch64::RETAB:
8888   case AArch64::RETAASPPCi:
8889   case AArch64::RETAASPPCr:
8890   case AArch64::RETABSPPCi:
8891   case AArch64::RETABSPPCr:
8892   case AArch64::EMITBKEY:
8893   case AArch64::PAUTH_PROLOGUE:
8894   case AArch64::PAUTH_EPILOGUE:
8895     return outliner::InstrType::Illegal;
8896   }
8897 
8898   // Don't outline LOHs.
8899   if (FuncInfo->getLOHRelated().count(&MI))
8900     return outliner::InstrType::Illegal;
8901 
8902   // We can only outline these if we will tail call the outlined function, or
8903   // fix up the CFI offsets. Currently, CFI instructions are outlined only
8904   // when the outlined sequence is a tail call.
8905   //
8906   // FIXME: If the proper fixups for the offset are implemented, this should be
8907   // possible.
8908   if (MI.isCFIInstruction())
8909     return outliner::InstrType::Legal;
8910 
8911   // Is this a terminator for a basic block?
8912   if (MI.isTerminator())
8913     // TargetInstrInfo::getOutliningType has already filtered out anything
8914     // that would break this, so we can allow it here.
8915     return outliner::InstrType::Legal;
8916 
8917   // Make sure none of the operands are un-outlinable.
8918   for (const MachineOperand &MOP : MI.operands()) {
8919     // A check preventing CFI indices was here before, but only CFI
8920     // instructions should have those.
8921     assert(!MOP.isCFIIndex());
8922 
8923     // If it uses LR or W30 explicitly, then don't touch it.
8924     if (MOP.isReg() && !MOP.isImplicit() &&
8925         (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
8926       return outliner::InstrType::Illegal;
8927   }
8928 
8929   // Special cases for instructions that can always be outlined, but will fail
8930   // the later tests, e.g., ADRPs, which are PC-relative, but can always
8931   // be outlined because they don't require a *specific* value to be in LR.
8932   if (MI.getOpcode() == AArch64::ADRP)
8933     return outliner::InstrType::Legal;
8934 
8935   // If MI is a call we might be able to outline it. We don't want to outline
8936   // any calls that rely on the position of items on the stack. When we outline
8937   // something containing a call, we have to emit a save and restore of LR in
8938   // the outlined function. Currently, this always happens by saving LR to the
8939   // stack. Thus, if we outline, say, half the parameters for a function call
8940   // plus the call, then we'll break the callee's expectations for the layout
8941   // of the stack.
8942   //
8943   // FIXME: Allow calls to functions which construct a stack frame, as long
8944   // as they don't access arguments on the stack.
8945   // FIXME: Figure out some way to analyze functions defined in other modules.
8946   // We should be able to compute the memory usage based on the IR calling
8947   // convention, even if we can't see the definition.
8948   if (MI.isCall()) {
8949     // Get the function associated with the call. Look at each operand and find
8950     // the one that represents the callee and get its name.
8951     const Function *Callee = nullptr;
8952     for (const MachineOperand &MOP : MI.operands()) {
8953       if (MOP.isGlobal()) {
8954         Callee = dyn_cast<Function>(MOP.getGlobal());
8955         break;
8956       }
8957     }
8958 
8959     // Never outline calls to mcount.  There isn't any rule that would require
8960     // this, but the Linux kernel's "ftrace" feature depends on it.
8961     if (Callee && Callee->getName() == "\01_mcount")
8962       return outliner::InstrType::Illegal;
8963 
8964     // If we don't know anything about the callee, assume it depends on the
8965     // stack layout of the caller. In that case, it's only legal to outline
8966     // as a tail-call. Explicitly list the call instructions we know about so we
8967     // don't get unexpected results with call pseudo-instructions.
8968     auto UnknownCallOutlineType = outliner::InstrType::Illegal;
8969     if (MI.getOpcode() == AArch64::BLR ||
8970         MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
8971       UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
8972 
8973     if (!Callee)
8974       return UnknownCallOutlineType;
8975 
8976     // We have a function we have information about. Check if it's something
8977     // we can safely outline.
8978     MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
8979 
8980     // We don't know what's going on with the callee at all. Don't touch it.
8981     if (!CalleeMF)
8982       return UnknownCallOutlineType;
8983 
8984     // Check if we know anything about the callee saves on the function. If we
8985     // don't, then don't touch it, since that implies that we haven't
8986     // computed anything about its stack frame yet.
8987     MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
8988     if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
8989         MFI.getNumObjects() > 0)
8990       return UnknownCallOutlineType;
8991 
8992     // At this point, we can say that CalleeMF ought to not pass anything on the
8993     // stack. Therefore, we can outline it.
8994     return outliner::InstrType::Legal;
8995   }
8996 
8997   // Don't touch the link register or W30.
8998   if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
8999       MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
9000     return outliner::InstrType::Illegal;
9001 
9002   // Don't outline BTI instructions, because that will prevent the outlining
9003   // site from being indirectly callable.
9004   if (hasBTISemantics(MI))
9005     return outliner::InstrType::Illegal;
9006 
9007   return outliner::InstrType::Legal;
9008 }
9009 
9010 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
9011   for (MachineInstr &MI : MBB) {
9012     const MachineOperand *Base;
9013     TypeSize Width(0, false);
9014     int64_t Offset;
9015     bool OffsetIsScalable;
9016 
9017     // Is this a load or store with an immediate offset with SP as the base?
9018     if (!MI.mayLoadOrStore() ||
9019         !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
9020                                       &RI) ||
9021         (Base->isReg() && Base->getReg() != AArch64::SP))
9022       continue;
9023 
9024     // It is, so we have to fix it up.
9025     TypeSize Scale(0U, false);
9026     int64_t Dummy1, Dummy2;
9027 
9028     MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
9029     assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
9030     getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
9031     assert(Scale != 0 && "Unexpected opcode!");
9032     assert(!OffsetIsScalable && "Expected offset to be a byte offset");
9033 
9034     // We've pushed the return address to the stack, so add 16 to the offset.
9035     // This is safe, since we already checked if it would overflow when we
9036     // checked if this instruction was legal to outline.
9037     int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
9038     StackOffsetOperand.setImm(NewImm);
9039   }
9040 }
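// Illustrative example of the fixup performed by fixupPostOutline above
// (assuming LDRXui with Scale = 8): "ldr x0, [sp, #8]" carries an immediate of
// 1 (8 / 8); after the outlined frame pushes LR with "str x30, [sp, #-16]!",
// the byte offset becomes 8 + 16 = 24, so setImm writes (8 + 16) / 8 = 3,
// i.e. the load becomes "ldr x0, [sp, #24]".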
9041 
9042 static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
9043                                  const AArch64InstrInfo *TII,
9044                                  bool ShouldSignReturnAddr) {
9045   if (!ShouldSignReturnAddr)
9046     return;
9047 
9048   BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE))
9049       .setMIFlag(MachineInstr::FrameSetup);
9050   BuildMI(MBB, MBB.getFirstInstrTerminator(), DebugLoc(),
9051           TII->get(AArch64::PAUTH_EPILOGUE))
9052       .setMIFlag(MachineInstr::FrameDestroy);
9053 }
9054 
9055 void AArch64InstrInfo::buildOutlinedFrame(
9056     MachineBasicBlock &MBB, MachineFunction &MF,
9057     const outliner::OutlinedFunction &OF) const {
9058 
9059   AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
9060 
9061   if (OF.FrameConstructionID == MachineOutlinerTailCall)
9062     FI->setOutliningStyle("Tail Call");
9063   else if (OF.FrameConstructionID == MachineOutlinerThunk) {
9064     // For thunk outlining, rewrite the last instruction from a call to a
9065     // tail-call.
9066     MachineInstr *Call = &*--MBB.instr_end();
9067     unsigned TailOpcode;
9068     if (Call->getOpcode() == AArch64::BL) {
9069       TailOpcode = AArch64::TCRETURNdi;
9070     } else {
9071       assert(Call->getOpcode() == AArch64::BLR ||
9072              Call->getOpcode() == AArch64::BLRNoIP);
9073       TailOpcode = AArch64::TCRETURNriALL;
9074     }
9075     MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
9076                            .add(Call->getOperand(0))
9077                            .addImm(0);
9078     MBB.insert(MBB.end(), TC);
9079     Call->eraseFromParent();
9080 
9081     FI->setOutliningStyle("Thunk");
9082   }
9083 
9084   bool IsLeafFunction = true;
9085 
9086   // Is there a call in the outlined range?
9087   auto IsNonTailCall = [](const MachineInstr &MI) {
9088     return MI.isCall() && !MI.isReturn();
9089   };
9090 
9091   if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
9092     // Fix up the instructions in the range, since we're going to modify the
9093     // stack.
9094 
9095     // Bugzilla ID: 46767
9096     // TODO: Check if fixing up twice is safe so we can outline these.
9097     assert(OF.FrameConstructionID != MachineOutlinerDefault &&
9098            "Can only fix up stack references once");
9099     fixupPostOutline(MBB);
9100 
9101     IsLeafFunction = false;
9102 
9103     // LR has to be a live in so that we can save it.
9104     if (!MBB.isLiveIn(AArch64::LR))
9105       MBB.addLiveIn(AArch64::LR);
9106 
9107     MachineBasicBlock::iterator It = MBB.begin();
9108     MachineBasicBlock::iterator Et = MBB.end();
9109 
9110     if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9111         OF.FrameConstructionID == MachineOutlinerThunk)
9112       Et = std::prev(MBB.end());
9113 
9114     // Insert a save before the outlined region
9115     MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
9116                                 .addReg(AArch64::SP, RegState::Define)
9117                                 .addReg(AArch64::LR)
9118                                 .addReg(AArch64::SP)
9119                                 .addImm(-16);
9120     It = MBB.insert(It, STRXpre);
9121 
9122     if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
9123       const TargetSubtargetInfo &STI = MF.getSubtarget();
9124       const MCRegisterInfo *MRI = STI.getRegisterInfo();
9125       unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
9126 
9127       // Add a CFI saying the stack was moved 16 B down.
9128       int64_t StackPosEntry =
9129           MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));
9130       BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
9131           .addCFIIndex(StackPosEntry)
9132           .setMIFlags(MachineInstr::FrameSetup);
9133 
9134       // Add a CFI saying that the LR that we want to find is now 16 B higher
9135       // than before.
9136       int64_t LRPosEntry = MF.addFrameInst(
9137           MCCFIInstruction::createOffset(nullptr, DwarfReg, -16));
9138       BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
9139           .addCFIIndex(LRPosEntry)
9140           .setMIFlags(MachineInstr::FrameSetup);
9141     }
9142 
9143     // Insert a restore before the terminator for the function.
9144     MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
9145                                  .addReg(AArch64::SP, RegState::Define)
9146                                  .addReg(AArch64::LR, RegState::Define)
9147                                  .addReg(AArch64::SP)
9148                                  .addImm(16);
9149     Et = MBB.insert(Et, LDRXpost);
9150   }
9151 
9152   bool ShouldSignReturnAddr = FI->shouldSignReturnAddress(!IsLeafFunction);
9153 
9154   // If this is a tail call outlined function, then there's already a return.
9155   if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9156       OF.FrameConstructionID == MachineOutlinerThunk) {
9157     signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
9158     return;
9159   }
9160 
9161   // It's not a tail call, so we have to insert the return ourselves.
9162 
9163   // LR has to be a live in so that we can return to it.
9164   if (!MBB.isLiveIn(AArch64::LR))
9165     MBB.addLiveIn(AArch64::LR);
9166 
9167   MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
9168                           .addReg(AArch64::LR);
9169   MBB.insert(MBB.end(), ret);
9170 
9171   signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
9172 
9173   FI->setOutliningStyle("Function");
9174 
9175   // Did we have to modify the stack by saving the link register?
9176   if (OF.FrameConstructionID != MachineOutlinerDefault)
9177     return;
9178 
9179   // We modified the stack.
9180   // Walk over the basic block and fix up all the stack accesses.
9181   fixupPostOutline(MBB);
9182 }
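// Rough shape of the frame built by buildOutlinedFrame above for the
// MachineOutlinerDefault case (illustrative only; the LR save/restore pair is
// emitted only when the outlined body contains a non-tail call):
//
//   OUTLINED_FUNCTION:
//     str x30, [sp, #-16]!      // STRXpre: save LR, CFI emitted if needed
//     ... outlined instructions ...
//     ldr x30, [sp], #16        // LDRXpost: restore LR
//     ret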
9183 
9184 MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
9185     Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
9186     MachineFunction &MF, outliner::Candidate &C) const {
9187 
9188   // Are we tail calling?
9189   if (C.CallConstructionID == MachineOutlinerTailCall) {
9190     // If yes, then we can just branch to the label.
9191     It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
9192                             .addGlobalAddress(M.getNamedValue(MF.getName()))
9193                             .addImm(0));
9194     return It;
9195   }
9196 
9197   // Are we saving the link register?
9198   if (C.CallConstructionID == MachineOutlinerNoLRSave ||
9199       C.CallConstructionID == MachineOutlinerThunk) {
9200     // No, so just insert the call.
9201     It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
9202                             .addGlobalAddress(M.getNamedValue(MF.getName())));
9203     return It;
9204   }
9205 
9206   // We want to return the spot where we inserted the call.
9207   MachineBasicBlock::iterator CallPt;
9208 
9209   // Instructions for saving and restoring LR around the call instruction we're
9210   // going to insert.
9211   MachineInstr *Save;
9212   MachineInstr *Restore;
9213   // Can we save to a register?
9214   if (C.CallConstructionID == MachineOutlinerRegSave) {
9215     // FIXME: This logic should be sunk into a target-specific interface so that
9216     // we don't have to recompute the register.
9217     Register Reg = findRegisterToSaveLRTo(C);
9218     assert(Reg && "No callee-saved register available?");
9219 
9220     // LR has to be a live in so that we can save it.
9221     if (!MBB.isLiveIn(AArch64::LR))
9222       MBB.addLiveIn(AArch64::LR);
9223 
9224     // Save and restore LR from Reg.
9225     Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
9226                .addReg(AArch64::XZR)
9227                .addReg(AArch64::LR)
9228                .addImm(0);
9229     Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
9230                 .addReg(AArch64::XZR)
9231                 .addReg(Reg)
9232                 .addImm(0);
9233   } else {
9234     // We have the default case. Save and restore from SP.
9235     Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
9236                .addReg(AArch64::SP, RegState::Define)
9237                .addReg(AArch64::LR)
9238                .addReg(AArch64::SP)
9239                .addImm(-16);
9240     Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
9241                   .addReg(AArch64::SP, RegState::Define)
9242                   .addReg(AArch64::LR, RegState::Define)
9243                   .addReg(AArch64::SP)
9244                   .addImm(16);
9245   }
9246 
9247   It = MBB.insert(It, Save);
9248   It++;
9249 
9250   // Insert the call.
9251   It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
9252                           .addGlobalAddress(M.getNamedValue(MF.getName())));
9253   CallPt = It;
9254   It++;
9255 
9256   It = MBB.insert(It, Restore);
9257   return CallPt;
9258 }
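// Call-site sequences produced by insertOutlinedCall above, per variant
// (illustrative only; TCRETURNdi later lowers to a plain branch):
//   MachineOutlinerTailCall:  b   OUTLINED_FUNCTION
//   MachineOutlinerNoLRSave /
//   MachineOutlinerThunk:     bl  OUTLINED_FUNCTION
//   MachineOutlinerRegSave:   mov xN, x30 ; bl OUTLINED_FUNCTION ; mov x30, xN
//   MachineOutlinerDefault:   str x30, [sp, #-16]! ; bl OUTLINED_FUNCTION ;
//                             ldr x30, [sp], #16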
9259 
9260 bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
9261   MachineFunction &MF) const {
9262   return MF.getFunction().hasMinSize();
9263 }
9264 
9265 void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
9266                                           MachineBasicBlock::iterator Iter,
9267                                           DebugLoc &DL,
9268                                           bool AllowSideEffects) const {
9269   const MachineFunction &MF = *MBB.getParent();
9270   const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
9271   const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
9272 
9273   if (TRI.isGeneralPurposeRegister(MF, Reg)) {
9274     BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0);
9275   } else if (STI.hasSVE()) {
9276     BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg)
9277       .addImm(0)
9278       .addImm(0);
9279   } else {
9280     BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg)
9281       .addImm(0);
9282   }
9283 }
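// For example (illustrative, assembly alias spellings may differ): clearing a
// GPR such as x0 emits "mov x0, #0" (MOVZXi), clearing an SVE vector register
// emits "dup z0.d, #0" (DUP_ZI_D), and clearing a FP/SIMD register without SVE
// emits "movi v0.2d, #0" (MOVIv2d_ns).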
9284 
9285 std::optional<DestSourcePair>
9286 AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
9287 
9288   // AArch64::ORRWrs and AArch64::ORRXrs with a WZR/XZR source register
9289   // and a zero immediate operand are used as an alias for the mov instruction.
9290   if (MI.getOpcode() == AArch64::ORRWrs &&
9291       MI.getOperand(1).getReg() == AArch64::WZR &&
9292       MI.getOperand(3).getImm() == 0x0 &&
9293       // Check that the w->w move is not a zero-extending w->x mov.
9294       (!MI.getOperand(0).getReg().isVirtual() ||
9295        MI.getOperand(0).getSubReg() == 0) &&
9296       (!MI.getOperand(0).getReg().isPhysical() ||
9297        MI.findRegisterDefOperandIdx(MI.getOperand(0).getReg() - AArch64::W0 +
9298                                         AArch64::X0,
9299                                     /*TRI=*/nullptr) == -1))
9300     return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
9301 
9302   if (MI.getOpcode() == AArch64::ORRXrs &&
9303       MI.getOperand(1).getReg() == AArch64::XZR &&
9304       MI.getOperand(3).getImm() == 0x0)
9305     return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
9306 
9307   return std::nullopt;
9308 }
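// For example (illustrative): "orr w0, wzr, w1" and "orr x0, xzr, x1" are the
// canonical "mov w0, w1" / "mov x0, x1" aliases, and both are reported as
// register copies by isCopyInstrImpl above, subject to the w->x zero-extension
// check for the 32-bit form.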
9309 
9310 std::optional<DestSourcePair>
9311 AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const {
9312   if (MI.getOpcode() == AArch64::ORRWrs &&
9313       MI.getOperand(1).getReg() == AArch64::WZR &&
9314       MI.getOperand(3).getImm() == 0x0)
9315     return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
9316   return std::nullopt;
9317 }
9318 
9319 std::optional<RegImmPair>
9320 AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
9321   int Sign = 1;
9322   int64_t Offset = 0;
9323 
9324   // TODO: Handle cases where Reg is a super- or sub-register of the
9325   // destination register.
9326   const MachineOperand &Op0 = MI.getOperand(0);
9327   if (!Op0.isReg() || Reg != Op0.getReg())
9328     return std::nullopt;
9329 
9330   switch (MI.getOpcode()) {
9331   default:
9332     return std::nullopt;
9333   case AArch64::SUBWri:
9334   case AArch64::SUBXri:
9335   case AArch64::SUBSWri:
9336   case AArch64::SUBSXri:
9337     Sign *= -1;
9338     [[fallthrough]];
9339   case AArch64::ADDSWri:
9340   case AArch64::ADDSXri:
9341   case AArch64::ADDWri:
9342   case AArch64::ADDXri: {
9343     // TODO: Third operand can be global address (usually some string).
9344     if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
9345         !MI.getOperand(2).isImm())
9346       return std::nullopt;
9347     int Shift = MI.getOperand(3).getImm();
9348     assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
9349     Offset = Sign * (MI.getOperand(2).getImm() << Shift);
9350   }
9351   }
9352   return RegImmPair{MI.getOperand(1).getReg(), Offset};
9353 }
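// Illustrative examples of the decoding above: "add x0, x1, #16" yields
// {Reg = x1, Offset = +16}, while "sub x0, x1, #4, lsl #12" yields
// {Reg = x1, Offset = -(4 << 12)} = {x1, -16384}.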
9354 
9355 /// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
9356 /// the destination register then, if possible, describe the value in terms of
9357 /// the source register.
9358 static std::optional<ParamLoadedValue>
9359 describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
9360                        const TargetInstrInfo *TII,
9361                        const TargetRegisterInfo *TRI) {
9362   auto DestSrc = TII->isCopyLikeInstr(MI);
9363   if (!DestSrc)
9364     return std::nullopt;
9365 
9366   Register DestReg = DestSrc->Destination->getReg();
9367   Register SrcReg = DestSrc->Source->getReg();
9368 
9369   auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
9370 
9371   // If the described register is the destination, just return the source.
9372   if (DestReg == DescribedReg)
9373     return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
9374 
9375   // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
9376   if (MI.getOpcode() == AArch64::ORRWrs &&
9377       TRI->isSuperRegister(DestReg, DescribedReg))
9378     return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
9379 
9380   // We may need to describe the lower part of a ORRXrs move.
9381   if (MI.getOpcode() == AArch64::ORRXrs &&
9382       TRI->isSubRegister(DestReg, DescribedReg)) {
9383     Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
9384     return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
9385   }
9386 
9387   assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
9388          "Unhandled ORR[XW]rs copy case");
9389 
9390   return std::nullopt;
9391 }
9392 
9393 bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
9394   // Functions cannot be split to different sections on AArch64 if they have
9395   // a red zone. This is because relaxing a cross-section branch may require
9396   // incrementing the stack pointer to spill a register, which would overwrite
9397   // the red zone.
9398   if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true))
9399     return false;
9400 
9401   return TargetInstrInfo::isFunctionSafeToSplit(MF);
9402 }
9403 
9404 bool AArch64InstrInfo::isMBBSafeToSplitToCold(
9405     const MachineBasicBlock &MBB) const {
9406   // Asm Goto blocks can contain conditional branches to goto labels, which can
9407   // get moved out of range of the branch instruction.
9408   auto isAsmGoto = [](const MachineInstr &MI) {
9409     return MI.getOpcode() == AArch64::INLINEASM_BR;
9410   };
9411   if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
9412     return false;
9413 
9414   // Because jump tables are label-relative instead of table-relative, they all
9415   // must be in the same section or relocation fixup handling will fail.
9416 
9417   // Check if MBB is a jump table target
9418   const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
9419   auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
9420     return llvm::is_contained(JTE.MBBs, &MBB);
9421   };
9422   if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB))
9423     return false;
9424 
9425   // Check if MBB contains a jump table lookup
9426   for (const MachineInstr &MI : MBB) {
9427     switch (MI.getOpcode()) {
9428     case TargetOpcode::G_BRJT:
9429     case AArch64::JumpTableDest32:
9430     case AArch64::JumpTableDest16:
9431     case AArch64::JumpTableDest8:
9432       return false;
9433     default:
9434       continue;
9435     }
9436   }
9437 
9438   // MBB isn't a special case, so it's safe to be split to the cold section.
9439   return true;
9440 }
9441 
9442 std::optional<ParamLoadedValue>
9443 AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
9444                                       Register Reg) const {
9445   const MachineFunction *MF = MI.getMF();
9446   const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
9447   switch (MI.getOpcode()) {
9448   case AArch64::MOVZWi:
9449   case AArch64::MOVZXi: {
9450     // MOVZWi may be used for producing zero-extended 32-bit immediates in
9451     // 64-bit parameters, so we need to consider super-registers.
9452     if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
9453       return std::nullopt;
9454 
9455     if (!MI.getOperand(1).isImm())
9456       return std::nullopt;
9457     int64_t Immediate = MI.getOperand(1).getImm();
9458     int Shift = MI.getOperand(2).getImm();
9459     return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
9460                             nullptr);
9461   }
9462   case AArch64::ORRWrs:
9463   case AArch64::ORRXrs:
9464     return describeORRLoadedValue(MI, Reg, this, TRI);
9465   }
9466 
9467   return TargetInstrInfo::describeLoadedValue(MI, Reg);
9468 }
9469 
9470 bool AArch64InstrInfo::isExtendLikelyToBeFolded(
9471     MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
9472   assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
9473          ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
9474          ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
9475 
9476   // Anyexts are nops.
9477   if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
9478     return true;
9479 
9480   Register DefReg = ExtMI.getOperand(0).getReg();
9481   if (!MRI.hasOneNonDBGUse(DefReg))
9482     return false;
9483 
9484   // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
9485   // addressing mode.
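  // For example (a sketch): a G_SEXT whose only use is the offset of a
  // G_PTR_ADD feeding a load can usually be selected into the extending
  // register-offset form "ldr x0, [xBase, wIdx, sxtw]" rather than a
  // separate extend instruction.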
9486   auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
9487   return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
9488 }
9489 
9490 uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
9491   return get(Opc).TSFlags & AArch64::ElementSizeMask;
9492 }
9493 
9494 bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
9495   return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
9496 }
9497 
9498 bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
9499   return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
9500 }
9501 
9502 unsigned int
9503 AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
9504   return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
9505 }
9506 
9507 bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
9508                                              unsigned Scale) const {
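  // A sketch of the forms accepted below (Base is the base register):
  //   Scale == 0: [Base, #imm] with a 9-bit signed imm, or a positive imm
  //               that is a multiple of NumBytes and at most NumBytes * 4095
  //               (the scaled 12-bit unsigned form).
  //   Scale != 0: [Base, Reg2] when Scale == 1, or
  //               [Base, Reg2, lsl #log2(NumBytes)] when Scale == NumBytes.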
9509   if (Offset && Scale)
9510     return false;
9511 
9512   // Check Reg + Imm
9513   if (!Scale) {
9514     // 9-bit signed offset
9515     if (isInt<9>(Offset))
9516       return true;
9517 
9518     // 12-bit unsigned offset
9519     unsigned Shift = Log2_64(NumBytes);
9520     if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
9521         // Must be a multiple of NumBytes (NumBytes is a power of 2)
9522         (Offset >> Shift) << Shift == Offset)
9523       return true;
9524     return false;
9525   }
9526 
9527   // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
9528   return Scale == 1 || (Scale > 0 && Scale == NumBytes);
9529 }
9530 
9531 unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
9532   if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
9533     return AArch64::BLRNoIP;
9534   else
9535     return AArch64::BLR;
9536 }
9537 
9538 MachineBasicBlock::iterator
9539 AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
9540                                    Register TargetReg, bool FrameSetup) const {
9541   assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
9542 
9543   MachineBasicBlock &MBB = *MBBI->getParent();
9544   MachineFunction &MF = *MBB.getParent();
9545   const AArch64InstrInfo *TII =
9546       MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
9547   int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
9548   DebugLoc DL = MBB.findDebugLoc(MBBI);
9549 
9550   MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
9551   MachineBasicBlock *LoopTestMBB =
9552       MF.CreateMachineBasicBlock(MBB.getBasicBlock());
9553   MF.insert(MBBInsertPoint, LoopTestMBB);
9554   MachineBasicBlock *LoopBodyMBB =
9555       MF.CreateMachineBasicBlock(MBB.getBasicBlock());
9556   MF.insert(MBBInsertPoint, LoopBodyMBB);
9557   MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
9558   MF.insert(MBBInsertPoint, ExitMBB);
9559   MachineInstr::MIFlag Flags =
9560       FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;
9561 
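  // The code below emits the following probe loop (a sketch):
  //   LoopTest:
  //     sub  sp, sp, #ProbeSize
  //     cmp  sp, TargetReg
  //     b.le LoopExit
  //   LoopBody:
  //     str  xzr, [sp]
  //     b    LoopTest
  //   LoopExit:
  //     mov  sp, TargetReg
  //     ldr  xzr, [sp]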
9562   // LoopTest:
9563   //   SUB SP, SP, #ProbeSize
9564   emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
9565                   AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);
9566 
9567   //   CMP SP, TargetReg
9568   BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
9569           AArch64::XZR)
9570       .addReg(AArch64::SP)
9571       .addReg(TargetReg)
9572       .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
9573       .setMIFlags(Flags);
9574 
9575   //   B.<Cond> LoopExit
9576   BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
9577       .addImm(AArch64CC::LE)
9578       .addMBB(ExitMBB)
9579       .setMIFlags(Flags);
9580 
9581   //   STR XZR, [SP]
9582   BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
9583       .addReg(AArch64::XZR)
9584       .addReg(AArch64::SP)
9585       .addImm(0)
9586       .setMIFlags(Flags);
9587 
9588   //   B loop
9589   BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
9590       .addMBB(LoopTestMBB)
9591       .setMIFlags(Flags);
9592 
9593   // LoopExit:
9594   //   MOV SP, TargetReg
9595   BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
9596       .addReg(TargetReg)
9597       .addImm(0)
9598       .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
9599       .setMIFlags(Flags);
9600 
9601   //   LDR XZR, [SP]
9602   BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))
9603       .addReg(AArch64::XZR, RegState::Define)
9604       .addReg(AArch64::SP)
9605       .addImm(0)
9606       .setMIFlags(Flags);
9607 
9608   ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
9609   ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
9610 
9611   LoopTestMBB->addSuccessor(ExitMBB);
9612   LoopTestMBB->addSuccessor(LoopBodyMBB);
9613   LoopBodyMBB->addSuccessor(LoopTestMBB);
9614   MBB.addSuccessor(LoopTestMBB);
9615 
9616   // Update liveins.
9617   if (MF.getRegInfo().reservedRegsFrozen())
9618     fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB});
9619 
9620   return ExitMBB->begin();
9621 }
9622 
9623 namespace {
9624 class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
9625   MachineFunction *MF;
9626   const TargetInstrInfo *TII;
9627   const TargetRegisterInfo *TRI;
9628   MachineRegisterInfo &MRI;
9629 
9630   /// The block of the loop
9631   MachineBasicBlock *LoopBB;
9632   /// The conditional branch of the loop
9633   MachineInstr *CondBranch;
9634   /// The compare instruction for loop control
9635   MachineInstr *Comp;
9636   /// The operand index of the loop counter value in Comp
9637   unsigned CompCounterOprNum;
9638   /// The instruction that updates the loop counter value
9639   MachineInstr *Update;
9640   /// The operand index of the loop counter value in Update
9641   unsigned UpdateCounterOprNum;
9642   /// The initial value of the loop counter
9643   Register Init;
9644   /// True iff Update is a predecessor of Comp
9645   bool IsUpdatePriorComp;
9646 
9647   /// The normalized condition used by createTripCountGreaterCondition()
9648   SmallVector<MachineOperand, 4> Cond;
9649 
9650 public:
9651   AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
9652                            MachineInstr *Comp, unsigned CompCounterOprNum,
9653                            MachineInstr *Update, unsigned UpdateCounterOprNum,
9654                            Register Init, bool IsUpdatePriorComp,
9655                            const SmallVectorImpl<MachineOperand> &Cond)
9656       : MF(Comp->getParent()->getParent()),
9657         TII(MF->getSubtarget().getInstrInfo()),
9658         TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
9659         LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
9660         CompCounterOprNum(CompCounterOprNum), Update(Update),
9661         UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
9662         IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}
9663 
9664   bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
9665     // Ensure the loop-control instructions are placed in stage 0.
9666     // The predecessors of Comp are considered by the caller.
9667     return MI == Comp;
9668   }
9669 
9670   std::optional<bool> createTripCountGreaterCondition(
9671       int TC, MachineBasicBlock &MBB,
9672       SmallVectorImpl<MachineOperand> &CondParam) override {
9673     // A branch instruction will be inserted as "if (Cond) goto epilogue".
9674     // Cond is normalized for such use.
9675     // The predecessors of the branch are assumed to have already been inserted.
9676     CondParam = Cond;
9677     return {};
9678   }
9679 
9680   void createRemainingIterationsGreaterCondition(
9681       int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
9682       DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;
9683 
9684   void setPreheader(MachineBasicBlock *NewPreheader) override {}
9685 
9686   void adjustTripCount(int TripCountAdjust) override {}
9687 
9688   void disposed() override {}
9689   bool isMVEExpanderSupported() override { return true; }
9690 };
9691 } // namespace
9692 
9693 /// Clone an instruction from MI. The register of the ReplaceOprNum-th operand
9694 /// is replaced by ReplaceReg. The output register is newly created.
9695 /// The other operands are unchanged from MI.
9696 static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
9697                            Register ReplaceReg, MachineBasicBlock &MBB,
9698                            MachineBasicBlock::iterator InsertTo) {
9699   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9700   const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
9701   const TargetRegisterInfo *TRI =
9702       MBB.getParent()->getSubtarget().getRegisterInfo();
9703   MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);
9704   Register Result = 0;
9705   for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
9706     if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) {
9707       Result = MRI.createVirtualRegister(
9708           MRI.getRegClass(NewMI->getOperand(0).getReg()));
9709       NewMI->getOperand(I).setReg(Result);
9710     } else if (I == ReplaceOprNum) {
9711       MRI.constrainRegClass(
9712           ReplaceReg,
9713           TII->getRegClass(NewMI->getDesc(), I, TRI, *MBB.getParent()));
9714       NewMI->getOperand(I).setReg(ReplaceReg);
9715     }
9716   }
9717   MBB.insert(InsertTo, NewMI);
9718   return Result;
9719 }
9720 
9721 void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
9722     int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
9723     DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) {
9724   // Create and accumulate conditions for next TC iterations.
9725   // Example:
9726   //   SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
9727   //                                          # iteration of the kernel
9728   //
9729   //   # insert the following instructions
9730   //   cond = CSINCXr 0, 0, C, implicit $nzcv
9731   //   counter = ADDXri counter, 1            # clone from this->Update
9732   //   SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
9733   //   cond = CSINCXr cond, cond, C, implicit $nzcv
9734   //   ... (repeat TC times)
9735   //   SUBSXri cond, 0, implicit-def $nzcv
9736 
9737   assert(CondBranch->getOpcode() == AArch64::Bcc);
9738   // CondCode to exit the loop
9739   AArch64CC::CondCode CC =
9740       (AArch64CC::CondCode)CondBranch->getOperand(0).getImm();
9741   if (CondBranch->getOperand(1).getMBB() == LoopBB)
9742     CC = AArch64CC::getInvertedCondCode(CC);
9743 
9744   // Accumulate conditions to exit the loop
9745   Register AccCond = AArch64::XZR;
9746 
9747   // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
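  // (A sketch of the underlying semantics: "CSINC Xd, Xn, Xm, cond" sets
  //  Xd = Xn if cond holds and Xd = Xm + 1 otherwise, so passing the inverted
  //  condition below yields CurCond + 1 exactly when CC holds.)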
9748   auto AccumulateCond = [&](Register CurCond,
9749                             AArch64CC::CondCode CC) -> Register {
9750     Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
9751     BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr))
9752         .addReg(NewCond, RegState::Define)
9753         .addReg(CurCond)
9754         .addReg(CurCond)
9755         .addImm(AArch64CC::getInvertedCondCode(CC));
9756     return NewCond;
9757   };
9758 
9759   if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
9760     // Update and Comp for I==0 already exist in MBB
9761     // (MBB is an unrolled kernel)
9762     Register Counter;
9763     for (int I = 0; I <= TC; ++I) {
9764       Register NextCounter;
9765       if (I != 0)
9766         NextCounter =
9767             cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
9768 
9769       AccCond = AccumulateCond(AccCond, CC);
9770 
9771       if (I != TC) {
9772         if (I == 0) {
9773           if (Update != Comp && IsUpdatePriorComp) {
9774             Counter =
9775                 LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
9776             NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB,
9777                                      MBB.end());
9778           } else {
9779             // We can use the already-calculated value.
9780             NextCounter = LastStage0Insts[Update]->getOperand(0).getReg();
9781           }
9782         } else if (Update != Comp) {
9783           NextCounter =
9784               cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
9785         }
9786       }
9787       Counter = NextCounter;
9788     }
9789   } else {
9790     Register Counter;
9791     if (LastStage0Insts.empty()) {
9792       // Use the initial counter value (this tests whether the trip count is
9793       // large enough for the pipelined code to be executed).
9794       Counter = Init;
9795       if (IsUpdatePriorComp)
9796         Counter =
9797             cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
9798     } else {
9799       // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
9800       Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
9801     }
9802 
9803     for (int I = 0; I <= TC; ++I) {
9804       Register NextCounter;
9805       NextCounter =
9806           cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
9807       AccCond = AccumulateCond(AccCond, CC);
9808       if (I != TC && Update != Comp)
9809         NextCounter =
9810             cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
9811       Counter = NextCounter;
9812     }
9813   }
9814 
9815   // If AccCond == 0, the remainder is greater than TC.
9816   BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri))
9817       .addReg(AArch64::XZR, RegState::Define | RegState::Dead)
9818       .addReg(AccCond)
9819       .addImm(0)
9820       .addImm(0);
9821   Cond.clear();
9822   Cond.push_back(MachineOperand::CreateImm(AArch64CC::EQ));
9823 }
9824 
9825 static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
9826                           Register &RegMBB, Register &RegOther) {
9827   assert(Phi.getNumOperands() == 5);
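  // A two-predecessor MIR PHI has the form
  //   %dst = PHI %val0, %bb.0, %val1, %bb.1
  // so operands 1 and 3 are the incoming values and operands 2 and 4 are the
  // corresponding predecessor blocks.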
9828   if (Phi.getOperand(2).getMBB() == MBB) {
9829     RegMBB = Phi.getOperand(1).getReg();
9830     RegOther = Phi.getOperand(3).getReg();
9831   } else {
9832     assert(Phi.getOperand(4).getMBB() == MBB);
9833     RegMBB = Phi.getOperand(3).getReg();
9834     RegOther = Phi.getOperand(1).getReg();
9835   }
9836 }
9837 
9838 static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) {
9839   if (!Reg.isVirtual())
9840     return false;
9841   const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
9842   return MRI.getVRegDef(Reg)->getParent() != BB;
9843 }
9844 
9845 /// If Reg is an induction variable, return true and set the output parameters.
9846 static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
9847                           MachineInstr *&UpdateInst,
9848                           unsigned &UpdateCounterOprNum, Register &InitReg,
9849                           bool &IsUpdatePriorComp) {
9850   // Example:
9851   //
9852   // Preheader:
9853   //   InitReg = ...
9854   // LoopBB:
9855   //   Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
9856   //   Reg = COPY Reg0 ; COPY is ignored.
9857   //   Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
9858   //                     ; Reg is the value calculated in the previous
9859   //                     ; iteration, so IsUpdatePriorComp == false.
9860 
9861   if (LoopBB->pred_size() != 2)
9862     return false;
9863   if (!Reg.isVirtual())
9864     return false;
9865   const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
9866   UpdateInst = nullptr;
9867   UpdateCounterOprNum = 0;
9868   InitReg = 0;
9869   IsUpdatePriorComp = true;
9870   Register CurReg = Reg;
9871   while (true) {
9872     MachineInstr *Def = MRI.getVRegDef(CurReg);
9873     if (Def->getParent() != LoopBB)
9874       return false;
9875     if (Def->isCopy()) {
9876       // Ignore copy instructions unless they involve subregisters.
9877       if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
9878         return false;
9879       CurReg = Def->getOperand(1).getReg();
9880     } else if (Def->isPHI()) {
9881       if (InitReg != 0)
9882         return false;
9883       if (!UpdateInst)
9884         IsUpdatePriorComp = false;
9885       extractPhiReg(*Def, LoopBB, CurReg, InitReg);
9886     } else {
9887       if (UpdateInst)
9888         return false;
9889       switch (Def->getOpcode()) {
9890       case AArch64::ADDSXri:
9891       case AArch64::ADDSWri:
9892       case AArch64::SUBSXri:
9893       case AArch64::SUBSWri:
9894       case AArch64::ADDXri:
9895       case AArch64::ADDWri:
9896       case AArch64::SUBXri:
9897       case AArch64::SUBWri:
9898         UpdateInst = Def;
9899         UpdateCounterOprNum = 1;
9900         break;
9901       case AArch64::ADDSXrr:
9902       case AArch64::ADDSWrr:
9903       case AArch64::SUBSXrr:
9904       case AArch64::SUBSWrr:
9905       case AArch64::ADDXrr:
9906       case AArch64::ADDWrr:
9907       case AArch64::SUBXrr:
9908       case AArch64::SUBWrr:
9909         UpdateInst = Def;
9910         if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB))
9911           UpdateCounterOprNum = 1;
9912         else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB))
9913           UpdateCounterOprNum = 2;
9914         else
9915           return false;
9916         break;
9917       default:
9918         return false;
9919       }
9920       CurReg = Def->getOperand(UpdateCounterOprNum).getReg();
9921     }
9922 
9923     if (!CurReg.isVirtual())
9924       return false;
9925     if (Reg == CurReg)
9926       break;
9927   }
9928 
9929   if (!UpdateInst)
9930     return false;
9931 
9932   return true;
9933 }
9934 
9935 std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
9936 AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
9937   // Accept loops that meet the following conditions
9938   // * The conditional branch is BCC
9939   // * The compare instruction is ADDS/SUBS/WHILEXX
9940   // * One operand of the compare is an induction variable and the other is a
9941   //   loop invariant value
9942   // * The induction variable is incremented/decremented by a single instruction
9943   // * Does not contain CALL or instructions which have unmodeled side effects
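  // For example (a sketch; register names are illustrative), a loop body of
  // the form
  //     %iv.next = ADDXri %iv, 1, 0                       ; the Update instruction
  //     %cmp = SUBSXrr %n, %iv.next, implicit-def $nzcv   ; the Comp instruction
  //     Bcc ne, %loop, implicit $nzcv                     ; the conditional branch
  // where %n is defined outside the loop satisfies these conditions.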
9944 
9945   for (MachineInstr &MI : *LoopBB)
9946     if (MI.isCall() || MI.hasUnmodeledSideEffects())
9947       // This instruction may use NZCV, which interferes with the instruction to
9948       // be inserted for loop control.
9949       return nullptr;
9950 
9951   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
9952   SmallVector<MachineOperand, 4> Cond;
9953   if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
9954     return nullptr;
9955 
9956   // Infinite loops are not supported
9957   if (TBB == LoopBB && FBB == LoopBB)
9958     return nullptr;
9959 
9960   // Must be conditional branch
9961   if (TBB != LoopBB && FBB == nullptr)
9962     return nullptr;
9963 
9964   assert((TBB == LoopBB || FBB == LoopBB) &&
9965          "The Loop must be a single-basic-block loop");
9966 
9967   MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
9968   const TargetRegisterInfo &TRI = getRegisterInfo();
9969 
9970   if (CondBranch->getOpcode() != AArch64::Bcc)
9971     return nullptr;
9972 
9973   // Normalization for createTripCountGreaterCondition()
9974   if (TBB == LoopBB)
9975     reverseBranchCondition(Cond);
9976 
9977   MachineInstr *Comp = nullptr;
9978   unsigned CompCounterOprNum = 0;
9979   for (MachineInstr &MI : reverse(*LoopBB)) {
9980     if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
9981       // Ensure that the compare is SUBS/ADDS/WHILEXX and that one of its
9982       // operands is a loop-invariant value.
9983 
9984       switch (MI.getOpcode()) {
9985       case AArch64::SUBSXri:
9986       case AArch64::SUBSWri:
9987       case AArch64::ADDSXri:
9988       case AArch64::ADDSWri:
9989         Comp = &MI;
9990         CompCounterOprNum = 1;
9991         break;
9992       case AArch64::ADDSWrr:
9993       case AArch64::ADDSXrr:
9994       case AArch64::SUBSWrr:
9995       case AArch64::SUBSXrr:
9996         Comp = &MI;
9997         break;
9998       default:
9999         if (isWhileOpcode(MI.getOpcode())) {
10000           Comp = &MI;
10001           break;
10002         }
10003         return nullptr;
10004       }
10005 
10006       if (CompCounterOprNum == 0) {
10007         if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB))
10008           CompCounterOprNum = 2;
10009         else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB))
10010           CompCounterOprNum = 1;
10011         else
10012           return nullptr;
10013       }
10014       break;
10015     }
10016   }
10017   if (!Comp)
10018     return nullptr;
10019 
10020   MachineInstr *Update = nullptr;
10021   Register Init;
10022   bool IsUpdatePriorComp;
10023   unsigned UpdateCounterOprNum;
10024   if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB,
10025                      Update, UpdateCounterOprNum, Init, IsUpdatePriorComp))
10026     return nullptr;
10027 
10028   return std::make_unique<AArch64PipelinerLoopInfo>(
10029       LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum,
10030       Init, IsUpdatePriorComp, Cond);
10031 }
10032 
10033 #define GET_INSTRINFO_HELPERS
10034 #define GET_INSTRMAP_INFO
10035 #include "AArch64GenInstrInfo.inc"
10036