xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp (revision 1db9f3b21e39176dd5b67cf8ac378633b172463e)
1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of the TargetInstrInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64InstrInfo.h"
14 #include "AArch64ExpandImm.h"
15 #include "AArch64FrameLowering.h"
16 #include "AArch64MachineFunctionInfo.h"
17 #include "AArch64PointerAuth.h"
18 #include "AArch64Subtarget.h"
19 #include "MCTargetDesc/AArch64AddressingModes.h"
20 #include "Utils/AArch64BaseInfo.h"
21 #include "llvm/ADT/ArrayRef.h"
22 #include "llvm/ADT/STLExtras.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/CodeGen/LivePhysRegs.h"
25 #include "llvm/CodeGen/MachineBasicBlock.h"
26 #include "llvm/CodeGen/MachineCombinerPattern.h"
27 #include "llvm/CodeGen/MachineFrameInfo.h"
28 #include "llvm/CodeGen/MachineFunction.h"
29 #include "llvm/CodeGen/MachineInstr.h"
30 #include "llvm/CodeGen/MachineInstrBuilder.h"
31 #include "llvm/CodeGen/MachineMemOperand.h"
32 #include "llvm/CodeGen/MachineModuleInfo.h"
33 #include "llvm/CodeGen/MachineOperand.h"
34 #include "llvm/CodeGen/MachineRegisterInfo.h"
35 #include "llvm/CodeGen/RegisterScavenging.h"
36 #include "llvm/CodeGen/StackMaps.h"
37 #include "llvm/CodeGen/TargetRegisterInfo.h"
38 #include "llvm/CodeGen/TargetSubtargetInfo.h"
39 #include "llvm/IR/DebugInfoMetadata.h"
40 #include "llvm/IR/DebugLoc.h"
41 #include "llvm/IR/GlobalValue.h"
42 #include "llvm/MC/MCAsmInfo.h"
43 #include "llvm/MC/MCInst.h"
44 #include "llvm/MC/MCInstBuilder.h"
45 #include "llvm/MC/MCInstrDesc.h"
46 #include "llvm/Support/Casting.h"
47 #include "llvm/Support/CodeGen.h"
48 #include "llvm/Support/CommandLine.h"
49 #include "llvm/Support/ErrorHandling.h"
50 #include "llvm/Support/LEB128.h"
51 #include "llvm/Support/MathExtras.h"
52 #include "llvm/Target/TargetMachine.h"
53 #include "llvm/Target/TargetOptions.h"
54 #include <cassert>
55 #include <cstdint>
56 #include <iterator>
57 #include <utility>
58 
59 using namespace llvm;
60 
61 #define GET_INSTRINFO_CTOR_DTOR
62 #include "AArch64GenInstrInfo.inc"
63 
64 static cl::opt<unsigned> TBZDisplacementBits(
65     "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
66     cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
67 
68 static cl::opt<unsigned> CBZDisplacementBits(
69     "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
70     cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
71 
72 static cl::opt<unsigned>
73     BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
74                         cl::desc("Restrict range of Bcc instructions (DEBUG)"));
75 
76 static cl::opt<unsigned>
77     BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
78                       cl::desc("Restrict range of B instructions (DEBUG)"));
79 
80 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
81     : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
82                           AArch64::CATCHRET),
83       RI(STI.getTargetTriple()), Subtarget(STI) {}
84 
85 /// getInstSizeInBytes - Return the number of bytes of code the specified
86 /// instruction may occupy.  This returns the maximum number of bytes.
87 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
88   const MachineBasicBlock &MBB = *MI.getParent();
89   const MachineFunction *MF = MBB.getParent();
90   const Function &F = MF->getFunction();
91   const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
92 
93   {
94     auto Op = MI.getOpcode();
95     if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
96       return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
97   }
98 
99   // Meta-instructions emit no code.
100   if (MI.isMetaInstruction())
101     return 0;
102 
103   // FIXME: We currently only handle pseudoinstructions that don't get expanded
104   //        before the assembly printer.
105   unsigned NumBytes = 0;
106   const MCInstrDesc &Desc = MI.getDesc();
107 
108   // The size should preferably be set in
109   // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
110   // The specific cases below handle instructions of variable size.
111   switch (Desc.getOpcode()) {
112   default:
113     if (Desc.getSize())
114       return Desc.getSize();
115 
116     // Anything not explicitly designated otherwise (i.e. pseudo-instructions
117     // with fixed constant size but not specified in .td file) is a normal
118     // 4-byte insn.
119     NumBytes = 4;
120     break;
121   case TargetOpcode::STACKMAP:
122     // The upper bound for a stackmap intrinsic is the full length of its shadow
123     NumBytes = StackMapOpers(&MI).getNumPatchBytes();
124     assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
125     break;
126   case TargetOpcode::PATCHPOINT:
127     // The size of the patchpoint intrinsic is the number of bytes requested
128     NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
129     assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
130     break;
131   case TargetOpcode::STATEPOINT:
132     NumBytes = StatepointOpers(&MI).getNumPatchBytes();
133     assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
134     // No patch bytes means a normal call inst is emitted
135     if (NumBytes == 0)
136       NumBytes = 4;
137     break;
138   case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
139     // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
140     // instructions are expanded to the specified number of NOPs. Otherwise,
141     // they are expanded to 36-byte XRay sleds.
142     NumBytes =
143         F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
144     break;
145   case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
146   case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
147     // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
148     NumBytes = 36;
149     break;
150   case TargetOpcode::PATCHABLE_EVENT_CALL:
151     // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
152     NumBytes = 24;
153     break;
154 
155   case AArch64::SPACE:
156     NumBytes = MI.getOperand(1).getImm();
157     break;
158   case TargetOpcode::BUNDLE:
159     NumBytes = getInstBundleLength(MI);
160     break;
161   }
162 
163   return NumBytes;
164 }
165 
166 unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
167   unsigned Size = 0;
168   MachineBasicBlock::const_instr_iterator I = MI.getIterator();
169   MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
170   while (++I != E && I->isInsideBundle()) {
171     assert(!I->isBundle() && "No nested bundle!");
172     Size += getInstSizeInBytes(*I);
173   }
174   return Size;
175 }
176 
177 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
178                             SmallVectorImpl<MachineOperand> &Cond) {
179   // Block ends with fall-through condbranch.
180   switch (LastInst->getOpcode()) {
181   default:
182     llvm_unreachable("Unknown branch instruction?");
183   case AArch64::Bcc:
184     Target = LastInst->getOperand(1).getMBB();
185     Cond.push_back(LastInst->getOperand(0));
186     break;
187   case AArch64::CBZW:
188   case AArch64::CBZX:
189   case AArch64::CBNZW:
190   case AArch64::CBNZX:
191     Target = LastInst->getOperand(1).getMBB();
192     Cond.push_back(MachineOperand::CreateImm(-1));
193     Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
194     Cond.push_back(LastInst->getOperand(0));
195     break;
196   case AArch64::TBZW:
197   case AArch64::TBZX:
198   case AArch64::TBNZW:
199   case AArch64::TBNZX:
200     Target = LastInst->getOperand(2).getMBB();
201     Cond.push_back(MachineOperand::CreateImm(-1));
202     Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
203     Cond.push_back(LastInst->getOperand(0));
204     Cond.push_back(LastInst->getOperand(1));
205   }
206 }
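// Added note (derived from the code in this file, not an upstream comment): the
// condition vector filled in above uses the following layout, which
// reverseBranchCondition(), instantiateCondBranch() and insertSelect() below
// all decode:
//   b.cc    -> Cond = { cc }
//   cb(n)z  -> Cond = { -1, opcode, reg }
//   tb(n)z  -> Cond = { -1, opcode, reg, bit }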
207 
208 static unsigned getBranchDisplacementBits(unsigned Opc) {
209   switch (Opc) {
210   default:
211     llvm_unreachable("unexpected opcode!");
212   case AArch64::B:
213     return BDisplacementBits;
214   case AArch64::TBNZW:
215   case AArch64::TBZW:
216   case AArch64::TBNZX:
217   case AArch64::TBZX:
218     return TBZDisplacementBits;
219   case AArch64::CBNZW:
220   case AArch64::CBZW:
221   case AArch64::CBNZX:
222   case AArch64::CBZX:
223     return CBZDisplacementBits;
224   case AArch64::Bcc:
225     return BCCDisplacementBits;
226   }
227 }
228 
229 bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
230                                              int64_t BrOffset) const {
231   unsigned Bits = getBranchDisplacementBits(BranchOp);
232   assert(Bits >= 3 && "max branch displacement must be enough to jump "
233                       "over conditional branch expansion");
234   return isIntN(Bits, BrOffset / 4);
235 }
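// Worked example under the default cl::opt values above: displacements are
// measured in 4-byte units, so TB(N)Z reaches roughly +/-32KiB (2^13 * 4),
// CB(N)Z and B.cc roughly +/-1MiB (2^18 * 4), and B roughly +/-128MiB (2^25 * 4).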
236 
237 MachineBasicBlock *
238 AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
239   switch (MI.getOpcode()) {
240   default:
241     llvm_unreachable("unexpected opcode!");
242   case AArch64::B:
243     return MI.getOperand(0).getMBB();
244   case AArch64::TBZW:
245   case AArch64::TBNZW:
246   case AArch64::TBZX:
247   case AArch64::TBNZX:
248     return MI.getOperand(2).getMBB();
249   case AArch64::CBZW:
250   case AArch64::CBNZW:
251   case AArch64::CBZX:
252   case AArch64::CBNZX:
253   case AArch64::Bcc:
254     return MI.getOperand(1).getMBB();
255   }
256 }
257 
258 void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
259                                             MachineBasicBlock &NewDestBB,
260                                             MachineBasicBlock &RestoreBB,
261                                             const DebugLoc &DL,
262                                             int64_t BrOffset,
263                                             RegScavenger *RS) const {
264   assert(RS && "RegScavenger required for long branching");
265   assert(MBB.empty() &&
266          "new block should be inserted for expanding unconditional branch");
267   assert(MBB.pred_size() == 1);
268   assert(RestoreBB.empty() &&
269          "restore block should be inserted for restoring clobbered registers");
270 
271   auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
272     // Offsets outside of the signed 33-bit range are not supported for ADRP +
273     // ADD.
274     if (!isInt<33>(BrOffset))
275       report_fatal_error(
276           "Branch offsets outside of the signed 33-bit range not supported");
277 
278     BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
279         .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
280     BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
281         .addReg(Reg)
282         .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
283         .addImm(0);
284     BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
285   };
286 
287   RS->enterBasicBlockEnd(MBB);
288   // If X16 is unused, we can rely on the linker to insert a range extension
289   // thunk if NewDestBB is out of range of a single B instruction.
290   constexpr Register Reg = AArch64::X16;
291   if (!RS->isRegUsed(Reg)) {
292     insertUnconditionalBranch(MBB, &NewDestBB, DL);
293     RS->setRegUsed(Reg);
294     return;
295   }
296 
297   // If there's a free register and it's worth inflating the code size,
298   // manually insert the indirect branch.
299   Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
300   if (Scavenged != AArch64::NoRegister &&
301       MBB.getSectionID() == MBBSectionID::ColdSectionID) {
302     buildIndirectBranch(Scavenged, NewDestBB);
303     RS->setRegUsed(Scavenged);
304     return;
305   }
306 
307   // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
308   // with red zones.
309   AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
310   if (!AFI || AFI->hasRedZone().value_or(true))
311     report_fatal_error(
312         "Unable to insert indirect branch inside function that has red zone");
313 
314   // Otherwise, spill X16 and defer range extension to the linker.
315   BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
316       .addReg(AArch64::SP, RegState::Define)
317       .addReg(Reg)
318       .addReg(AArch64::SP)
319       .addImm(-16);
320 
321   BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);
322 
323   BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
324       .addReg(AArch64::SP, RegState::Define)
325       .addReg(Reg, RegState::Define)
326       .addReg(AArch64::SP)
327       .addImm(16);
328 }
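// Rough shape of the spill path emitted above (illustrative sketch only):
//   str  x16, [sp, #-16]!   ; STRXpre: spill X16, moving SP (hence the red-zone check)
//   b    RestoreBB          ; the linker may route this through a thunk that clobbers X16
// RestoreBB:
//   ldr  x16, [sp], #16     ; LDRXpost: reload X16 before control reaches NewDestBB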
329 
330 // Branch analysis.
331 bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
332                                      MachineBasicBlock *&TBB,
333                                      MachineBasicBlock *&FBB,
334                                      SmallVectorImpl<MachineOperand> &Cond,
335                                      bool AllowModify) const {
336   // If the block has no terminators, it just falls into the block after it.
337   MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
338   if (I == MBB.end())
339     return false;
340 
341   // Skip over SpeculationBarrierEndBB terminators
342   if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
343       I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
344     --I;
345   }
346 
347   if (!isUnpredicatedTerminator(*I))
348     return false;
349 
350   // Get the last instruction in the block.
351   MachineInstr *LastInst = &*I;
352 
353   // If there is only one terminator instruction, process it.
354   unsigned LastOpc = LastInst->getOpcode();
355   if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
356     if (isUncondBranchOpcode(LastOpc)) {
357       TBB = LastInst->getOperand(0).getMBB();
358       return false;
359     }
360     if (isCondBranchOpcode(LastOpc)) {
361       // Block ends with fall-through condbranch.
362       parseCondBranch(LastInst, TBB, Cond);
363       return false;
364     }
365     return true; // Can't handle indirect branch.
366   }
367 
368   // Get the instruction before it if it is a terminator.
369   MachineInstr *SecondLastInst = &*I;
370   unsigned SecondLastOpc = SecondLastInst->getOpcode();
371 
372   // If AllowModify is true and the block ends with two or more unconditional
373   // branches, delete all but the first unconditional branch.
374   if (AllowModify && isUncondBranchOpcode(LastOpc)) {
375     while (isUncondBranchOpcode(SecondLastOpc)) {
376       LastInst->eraseFromParent();
377       LastInst = SecondLastInst;
378       LastOpc = LastInst->getOpcode();
379       if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
380         // Return now; the only terminator is an unconditional branch.
381         TBB = LastInst->getOperand(0).getMBB();
382         return false;
383       }
384       SecondLastInst = &*I;
385       SecondLastOpc = SecondLastInst->getOpcode();
386     }
387   }
388 
389   // If we're allowed to modify and the block ends in an unconditional branch
390   // which could simply fallthrough, remove the branch.  (Note: This case only
391   // matters when we can't understand the whole sequence, otherwise it's also
392   // handled by BranchFolding.cpp.)
393   if (AllowModify && isUncondBranchOpcode(LastOpc) &&
394       MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
395     LastInst->eraseFromParent();
396     LastInst = SecondLastInst;
397     LastOpc = LastInst->getOpcode();
398     if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
399       assert(!isUncondBranchOpcode(LastOpc) &&
400              "unreachable unconditional branches removed above");
401 
402       if (isCondBranchOpcode(LastOpc)) {
403         // Block ends with fall-through condbranch.
404         parseCondBranch(LastInst, TBB, Cond);
405         return false;
406       }
407       return true; // Can't handle indirect branch.
408     }
409     SecondLastInst = &*I;
410     SecondLastOpc = SecondLastInst->getOpcode();
411   }
412 
413   // If there are three terminators, we don't know what sort of block this is.
414   if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
415     return true;
416 
417   // If the block ends with a B and a Bcc, handle it.
418   if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
419     parseCondBranch(SecondLastInst, TBB, Cond);
420     FBB = LastInst->getOperand(0).getMBB();
421     return false;
422   }
423 
424   // If the block ends with two unconditional branches, handle it.  The second
425   // one is not executed, so remove it.
426   if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
427     TBB = SecondLastInst->getOperand(0).getMBB();
428     I = LastInst;
429     if (AllowModify)
430       I->eraseFromParent();
431     return false;
432   }
433 
434   // ...likewise if it ends with an indirect branch followed by an unconditional
435   // branch.
436   if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
437     I = LastInst;
438     if (AllowModify)
439       I->eraseFromParent();
440     return true;
441   }
442 
443   // Otherwise, can't handle this.
444   return true;
445 }
446 
447 bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
448                                               MachineBranchPredicate &MBP,
449                                               bool AllowModify) const {
450   // For the moment, handle only a block which ends with a cb(n)zx followed by
451   // a fallthrough.  Why this?  Because it is a common form.
452   // TODO: Should we handle b.cc?
453 
454   MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
455   if (I == MBB.end())
456     return true;
457 
458   // Skip over SpeculationBarrierEndBB terminators
459   if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
460       I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
461     --I;
462   }
463 
464   if (!isUnpredicatedTerminator(*I))
465     return true;
466 
467   // Get the last instruction in the block.
468   MachineInstr *LastInst = &*I;
469   unsigned LastOpc = LastInst->getOpcode();
470   if (!isCondBranchOpcode(LastOpc))
471     return true;
472 
473   switch (LastOpc) {
474   default:
475     return true;
476   case AArch64::CBZW:
477   case AArch64::CBZX:
478   case AArch64::CBNZW:
479   case AArch64::CBNZX:
480     break;
481   };
482 
483   MBP.TrueDest = LastInst->getOperand(1).getMBB();
484   assert(MBP.TrueDest && "expected!");
485   MBP.FalseDest = MBB.getNextNode();
486 
487   MBP.ConditionDef = nullptr;
488   MBP.SingleUseCondition = false;
489 
490   MBP.LHS = LastInst->getOperand(0);
491   MBP.RHS = MachineOperand::CreateImm(0);
492   MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
493                                             : MachineBranchPredicate::PRED_EQ;
494   return false;
495 }
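// For example (sketch, register names are illustrative): for a block ending in
// "cbnz x0, %bb.2" this reports LHS = x0, RHS = #0, TrueDest = %bb.2,
// FalseDest = the next (fallthrough) block, and Predicate = PRED_NE.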
496 
497 bool AArch64InstrInfo::reverseBranchCondition(
498     SmallVectorImpl<MachineOperand> &Cond) const {
499   if (Cond[0].getImm() != -1) {
500     // Regular Bcc
501     AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
502     Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
503   } else {
504     // Folded compare-and-branch
505     switch (Cond[1].getImm()) {
506     default:
507       llvm_unreachable("Unknown conditional branch!");
508     case AArch64::CBZW:
509       Cond[1].setImm(AArch64::CBNZW);
510       break;
511     case AArch64::CBNZW:
512       Cond[1].setImm(AArch64::CBZW);
513       break;
514     case AArch64::CBZX:
515       Cond[1].setImm(AArch64::CBNZX);
516       break;
517     case AArch64::CBNZX:
518       Cond[1].setImm(AArch64::CBZX);
519       break;
520     case AArch64::TBZW:
521       Cond[1].setImm(AArch64::TBNZW);
522       break;
523     case AArch64::TBNZW:
524       Cond[1].setImm(AArch64::TBZW);
525       break;
526     case AArch64::TBZX:
527       Cond[1].setImm(AArch64::TBNZX);
528       break;
529     case AArch64::TBNZX:
530       Cond[1].setImm(AArch64::TBZX);
531       break;
532     }
533   }
534 
535   return false;
536 }
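// For example (sketch): reversing { -1, CBZW, reg } yields { -1, CBNZW, reg },
// while a plain { AArch64CC::EQ } becomes { AArch64CC::NE } via
// getInvertedCondCode().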
537 
538 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
539                                         int *BytesRemoved) const {
540   MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
541   if (I == MBB.end())
542     return 0;
543 
544   if (!isUncondBranchOpcode(I->getOpcode()) &&
545       !isCondBranchOpcode(I->getOpcode()))
546     return 0;
547 
548   // Remove the branch.
549   I->eraseFromParent();
550 
551   I = MBB.end();
552 
553   if (I == MBB.begin()) {
554     if (BytesRemoved)
555       *BytesRemoved = 4;
556     return 1;
557   }
558   --I;
559   if (!isCondBranchOpcode(I->getOpcode())) {
560     if (BytesRemoved)
561       *BytesRemoved = 4;
562     return 1;
563   }
564 
565   // Remove the branch.
566   I->eraseFromParent();
567   if (BytesRemoved)
568     *BytesRemoved = 8;
569 
570   return 2;
571 }
572 
573 void AArch64InstrInfo::instantiateCondBranch(
574     MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
575     ArrayRef<MachineOperand> Cond) const {
576   if (Cond[0].getImm() != -1) {
577     // Regular Bcc
578     BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
579   } else {
580     // Folded compare-and-branch
581     // Note that we use .add() instead of addReg() to keep the operand flags.
582     const MachineInstrBuilder MIB =
583         BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
584     if (Cond.size() > 3)
585       MIB.addImm(Cond[3].getImm());
586     MIB.addMBB(TBB);
587   }
588 }
589 
590 unsigned AArch64InstrInfo::insertBranch(
591     MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
592     ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
593   // Shouldn't be a fall through.
594   assert(TBB && "insertBranch must not be told to insert a fallthrough");
595 
596   if (!FBB) {
597     if (Cond.empty()) // Unconditional branch?
598       BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
599     else
600       instantiateCondBranch(MBB, DL, TBB, Cond);
601 
602     if (BytesAdded)
603       *BytesAdded = 4;
604 
605     return 1;
606   }
607 
608   // Two-way conditional branch.
609   instantiateCondBranch(MBB, DL, TBB, Cond);
610   BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
611 
612   if (BytesAdded)
613     *BytesAdded = 8;
614 
615   return 2;
616 }
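// For a two-way conditional branch the code above emits, e.g. (sketch):
//   b.eq  TBB
//   b     FBB
// and reports BytesAdded = 8 (two 4-byte instructions).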
617 
618 // Find the original register that VReg is copied from.
619 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
620   while (Register::isVirtualRegister(VReg)) {
621     const MachineInstr *DefMI = MRI.getVRegDef(VReg);
622     if (!DefMI->isFullCopy())
623       return VReg;
624     VReg = DefMI->getOperand(1).getReg();
625   }
626   return VReg;
627 }
628 
629 // Determine if VReg is defined by an instruction that can be folded into a
630 // csel instruction. If so, return the folded opcode, and the replacement
631 // register.
632 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
633                                 unsigned *NewVReg = nullptr) {
634   VReg = removeCopies(MRI, VReg);
635   if (!Register::isVirtualRegister(VReg))
636     return 0;
637 
638   bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
639   const MachineInstr *DefMI = MRI.getVRegDef(VReg);
640   unsigned Opc = 0;
641   unsigned SrcOpNum = 0;
642   switch (DefMI->getOpcode()) {
643   case AArch64::ADDSXri:
644   case AArch64::ADDSWri:
645     // if NZCV is used, do not fold.
646     if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
647       return 0;
648     // fall-through to ADDXri and ADDWri.
649     [[fallthrough]];
650   case AArch64::ADDXri:
651   case AArch64::ADDWri:
652     // add x, 1 -> csinc.
653     if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
654         DefMI->getOperand(3).getImm() != 0)
655       return 0;
656     SrcOpNum = 1;
657     Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
658     break;
659 
660   case AArch64::ORNXrr:
661   case AArch64::ORNWrr: {
662     // not x -> csinv, represented as orn dst, xzr, src.
663     unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
664     if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
665       return 0;
666     SrcOpNum = 2;
667     Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
668     break;
669   }
670 
671   case AArch64::SUBSXrr:
672   case AArch64::SUBSWrr:
673     // if NZCV is used, do not fold.
674     if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
675       return 0;
676     // fall-through to SUBXrr and SUBWrr.
677     [[fallthrough]];
678   case AArch64::SUBXrr:
679   case AArch64::SUBWrr: {
680     // neg x -> csneg, represented as sub dst, xzr, src.
681     unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
682     if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
683       return 0;
684     SrcOpNum = 2;
685     Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
686     break;
687   }
688   default:
689     return 0;
690   }
691   assert(Opc && SrcOpNum && "Missing parameters");
692 
693   if (NewVReg)
694     *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
695   return Opc;
696 }
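// Worked example (a sketch using illustrative virtual registers, not upstream IR):
//   %t = ADDWri %a, 1, 0     ; a + 1
//   %d = CSELWr %t, %b, cc   ; cc ? a + 1 : b
// canFoldIntoCSel(%t) returns CSINCWr with *NewVReg = %a, which lets
// insertSelect() below emit a single "csinc %d, %b, %a, !cc" instead of the
// add + csel pair.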
697 
698 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
699                                        ArrayRef<MachineOperand> Cond,
700                                        Register DstReg, Register TrueReg,
701                                        Register FalseReg, int &CondCycles,
702                                        int &TrueCycles,
703                                        int &FalseCycles) const {
704   // Check register classes.
705   const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
706   const TargetRegisterClass *RC =
707       RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
708   if (!RC)
709     return false;
710 
711   // Also need to check the dest regclass, in case we're trying to optimize
712   // something like:
713   // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
714   if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
715     return false;
716 
717   // Expanding cbz/tbz requires an extra cycle of latency on the condition.
718   unsigned ExtraCondLat = Cond.size() != 1;
719 
720   // GPRs are handled by csel.
721   // FIXME: Fold in x+1, -x, and ~x when applicable.
722   if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
723       AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
724     // Single-cycle csel, csinc, csinv, and csneg.
725     CondCycles = 1 + ExtraCondLat;
726     TrueCycles = FalseCycles = 1;
727     if (canFoldIntoCSel(MRI, TrueReg))
728       TrueCycles = 0;
729     else if (canFoldIntoCSel(MRI, FalseReg))
730       FalseCycles = 0;
731     return true;
732   }
733 
734   // Scalar floating point is handled by fcsel.
735   // FIXME: Form fabs, fmin, and fmax when applicable.
736   if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
737       AArch64::FPR32RegClass.hasSubClassEq(RC)) {
738     CondCycles = 5 + ExtraCondLat;
739     TrueCycles = FalseCycles = 2;
740     return true;
741   }
742 
743   // Can't do vectors.
744   return false;
745 }
746 
747 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
748                                     MachineBasicBlock::iterator I,
749                                     const DebugLoc &DL, Register DstReg,
750                                     ArrayRef<MachineOperand> Cond,
751                                     Register TrueReg, Register FalseReg) const {
752   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
753 
754   // Parse the condition code, see parseCondBranch() above.
755   AArch64CC::CondCode CC;
756   switch (Cond.size()) {
757   default:
758     llvm_unreachable("Unknown condition opcode in Cond");
759   case 1: // b.cc
760     CC = AArch64CC::CondCode(Cond[0].getImm());
761     break;
762   case 3: { // cbz/cbnz
763     // We must insert a compare against 0.
764     bool Is64Bit;
765     switch (Cond[1].getImm()) {
766     default:
767       llvm_unreachable("Unknown branch opcode in Cond");
768     case AArch64::CBZW:
769       Is64Bit = false;
770       CC = AArch64CC::EQ;
771       break;
772     case AArch64::CBZX:
773       Is64Bit = true;
774       CC = AArch64CC::EQ;
775       break;
776     case AArch64::CBNZW:
777       Is64Bit = false;
778       CC = AArch64CC::NE;
779       break;
780     case AArch64::CBNZX:
781       Is64Bit = true;
782       CC = AArch64CC::NE;
783       break;
784     }
785     Register SrcReg = Cond[2].getReg();
786     if (Is64Bit) {
787       // cmp reg, #0 is actually subs xzr, reg, #0.
788       MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
789       BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
790           .addReg(SrcReg)
791           .addImm(0)
792           .addImm(0);
793     } else {
794       MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
795       BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
796           .addReg(SrcReg)
797           .addImm(0)
798           .addImm(0);
799     }
800     break;
801   }
802   case 4: { // tbz/tbnz
803     // We must insert a tst instruction.
804     switch (Cond[1].getImm()) {
805     default:
806       llvm_unreachable("Unknown branch opcode in Cond");
807     case AArch64::TBZW:
808     case AArch64::TBZX:
809       CC = AArch64CC::EQ;
810       break;
811     case AArch64::TBNZW:
812     case AArch64::TBNZX:
813       CC = AArch64CC::NE;
814       break;
815     }
816     // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
817     if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
818       BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
819           .addReg(Cond[2].getReg())
820           .addImm(
821               AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
822     else
823       BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
824           .addReg(Cond[2].getReg())
825           .addImm(
826               AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
827     break;
828   }
829   }
830 
831   unsigned Opc = 0;
832   const TargetRegisterClass *RC = nullptr;
833   bool TryFold = false;
834   if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
835     RC = &AArch64::GPR64RegClass;
836     Opc = AArch64::CSELXr;
837     TryFold = true;
838   } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
839     RC = &AArch64::GPR32RegClass;
840     Opc = AArch64::CSELWr;
841     TryFold = true;
842   } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
843     RC = &AArch64::FPR64RegClass;
844     Opc = AArch64::FCSELDrrr;
845   } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
846     RC = &AArch64::FPR32RegClass;
847     Opc = AArch64::FCSELSrrr;
848   }
849   assert(RC && "Unsupported regclass");
850 
851   // Try folding simple instructions into the csel.
852   if (TryFold) {
853     unsigned NewVReg = 0;
854     unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
855     if (FoldedOpc) {
856       // The folded opcodes csinc, csinv and csneg apply the operation to
857       // FalseReg, so we need to invert the condition.
858       CC = AArch64CC::getInvertedCondCode(CC);
859       TrueReg = FalseReg;
860     } else
861       FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
862 
863     // Fold the operation. Leave any dead instructions for DCE to clean up.
864     if (FoldedOpc) {
865       FalseReg = NewVReg;
866       Opc = FoldedOpc;
867       // This extends the live range of NewVReg.
868       MRI.clearKillFlags(NewVReg);
869     }
870   }
871 
872   // Pull all virtual registers into the appropriate class.
873   MRI.constrainRegClass(TrueReg, RC);
874   MRI.constrainRegClass(FalseReg, RC);
875 
876   // Insert the csel.
877   BuildMI(MBB, I, DL, get(Opc), DstReg)
878       .addReg(TrueReg)
879       .addReg(FalseReg)
880       .addImm(CC);
881 }
882 
883 // Return true if Imm can be loaded into a register by a "cheap" sequence of
884 // instructions. For now, "cheap" means at most two instructions.
885 static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
886   if (BitSize == 32)
887     return true;
888 
889   assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
890   uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
891   SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
892   AArch64_IMM::expandMOVImm(Imm, BitSize, Is);
893 
894   return Is.size() <= 2;
895 }
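// For instance (a sketch assuming the usual MOVZ/MOVK expansion): a 64-bit value
// such as 0x12345 expands to movz + movk (two instructions) and is considered
// cheap, whereas 0x123456789abc needs three 16-bit chunks and is not.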
896 
897 // FIXME: this implementation should be micro-architecture dependent, so a
898 // micro-architecture target hook should be introduced here in the future.
899 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
900   if (Subtarget.hasExynosCheapAsMoveHandling()) {
901     if (isExynosCheapAsMove(MI))
902       return true;
903     return MI.isAsCheapAsAMove();
904   }
905 
906   switch (MI.getOpcode()) {
907   default:
908     return MI.isAsCheapAsAMove();
909 
910   case AArch64::ADDWrs:
911   case AArch64::ADDXrs:
912   case AArch64::SUBWrs:
913   case AArch64::SUBXrs:
914     return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;
915 
916   // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
917   // ORRXri, it is as cheap as MOV.
918   // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
919   case AArch64::MOVi32imm:
920     return isCheapImmediate(MI, 32);
921   case AArch64::MOVi64imm:
922     return isCheapImmediate(MI, 64);
923   }
924 }
925 
926 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
927   switch (MI.getOpcode()) {
928   default:
929     return false;
930 
931   case AArch64::ADDWrs:
932   case AArch64::ADDXrs:
933   case AArch64::ADDSWrs:
934   case AArch64::ADDSXrs: {
935     unsigned Imm = MI.getOperand(3).getImm();
936     unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
937     if (ShiftVal == 0)
938       return true;
939     return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
940   }
941 
942   case AArch64::ADDWrx:
943   case AArch64::ADDXrx:
944   case AArch64::ADDXrx64:
945   case AArch64::ADDSWrx:
946   case AArch64::ADDSXrx:
947   case AArch64::ADDSXrx64: {
948     unsigned Imm = MI.getOperand(3).getImm();
949     switch (AArch64_AM::getArithExtendType(Imm)) {
950     default:
951       return false;
952     case AArch64_AM::UXTB:
953     case AArch64_AM::UXTH:
954     case AArch64_AM::UXTW:
955     case AArch64_AM::UXTX:
956       return AArch64_AM::getArithShiftValue(Imm) <= 4;
957     }
958   }
959 
960   case AArch64::SUBWrs:
961   case AArch64::SUBSWrs: {
962     unsigned Imm = MI.getOperand(3).getImm();
963     unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
964     return ShiftVal == 0 ||
965            (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
966   }
967 
968   case AArch64::SUBXrs:
969   case AArch64::SUBSXrs: {
970     unsigned Imm = MI.getOperand(3).getImm();
971     unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
972     return ShiftVal == 0 ||
973            (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
974   }
975 
976   case AArch64::SUBWrx:
977   case AArch64::SUBXrx:
978   case AArch64::SUBXrx64:
979   case AArch64::SUBSWrx:
980   case AArch64::SUBSXrx:
981   case AArch64::SUBSXrx64: {
982     unsigned Imm = MI.getOperand(3).getImm();
983     switch (AArch64_AM::getArithExtendType(Imm)) {
984     default:
985       return false;
986     case AArch64_AM::UXTB:
987     case AArch64_AM::UXTH:
988     case AArch64_AM::UXTW:
989     case AArch64_AM::UXTX:
990       return AArch64_AM::getArithShiftValue(Imm) == 0;
991     }
992   }
993 
994   case AArch64::LDRBBroW:
995   case AArch64::LDRBBroX:
996   case AArch64::LDRBroW:
997   case AArch64::LDRBroX:
998   case AArch64::LDRDroW:
999   case AArch64::LDRDroX:
1000   case AArch64::LDRHHroW:
1001   case AArch64::LDRHHroX:
1002   case AArch64::LDRHroW:
1003   case AArch64::LDRHroX:
1004   case AArch64::LDRQroW:
1005   case AArch64::LDRQroX:
1006   case AArch64::LDRSBWroW:
1007   case AArch64::LDRSBWroX:
1008   case AArch64::LDRSBXroW:
1009   case AArch64::LDRSBXroX:
1010   case AArch64::LDRSHWroW:
1011   case AArch64::LDRSHWroX:
1012   case AArch64::LDRSHXroW:
1013   case AArch64::LDRSHXroX:
1014   case AArch64::LDRSWroW:
1015   case AArch64::LDRSWroX:
1016   case AArch64::LDRSroW:
1017   case AArch64::LDRSroX:
1018   case AArch64::LDRWroW:
1019   case AArch64::LDRWroX:
1020   case AArch64::LDRXroW:
1021   case AArch64::LDRXroX:
1022   case AArch64::PRFMroW:
1023   case AArch64::PRFMroX:
1024   case AArch64::STRBBroW:
1025   case AArch64::STRBBroX:
1026   case AArch64::STRBroW:
1027   case AArch64::STRBroX:
1028   case AArch64::STRDroW:
1029   case AArch64::STRDroX:
1030   case AArch64::STRHHroW:
1031   case AArch64::STRHHroX:
1032   case AArch64::STRHroW:
1033   case AArch64::STRHroX:
1034   case AArch64::STRQroW:
1035   case AArch64::STRQroX:
1036   case AArch64::STRSroW:
1037   case AArch64::STRSroX:
1038   case AArch64::STRWroW:
1039   case AArch64::STRWroX:
1040   case AArch64::STRXroW:
1041   case AArch64::STRXroX: {
1042     unsigned IsSigned = MI.getOperand(3).getImm();
1043     return !IsSigned;
1044   }
1045   }
1046 }
1047 
1048 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
1049   unsigned Opc = MI.getOpcode();
1050   switch (Opc) {
1051     default:
1052       return false;
1053     case AArch64::SEH_StackAlloc:
1054     case AArch64::SEH_SaveFPLR:
1055     case AArch64::SEH_SaveFPLR_X:
1056     case AArch64::SEH_SaveReg:
1057     case AArch64::SEH_SaveReg_X:
1058     case AArch64::SEH_SaveRegP:
1059     case AArch64::SEH_SaveRegP_X:
1060     case AArch64::SEH_SaveFReg:
1061     case AArch64::SEH_SaveFReg_X:
1062     case AArch64::SEH_SaveFRegP:
1063     case AArch64::SEH_SaveFRegP_X:
1064     case AArch64::SEH_SetFP:
1065     case AArch64::SEH_AddFP:
1066     case AArch64::SEH_Nop:
1067     case AArch64::SEH_PrologEnd:
1068     case AArch64::SEH_EpilogStart:
1069     case AArch64::SEH_EpilogEnd:
1070     case AArch64::SEH_PACSignLR:
1071       return true;
1072   }
1073 }
1074 
1075 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
1076                                              Register &SrcReg, Register &DstReg,
1077                                              unsigned &SubIdx) const {
1078   switch (MI.getOpcode()) {
1079   default:
1080     return false;
1081   case AArch64::SBFMXri: // aka sxtw
1082   case AArch64::UBFMXri: // aka uxtw
1083     // Check for the 32 -> 64 bit extension case, these instructions can do
1084     // much more.
1085     if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
1086       return false;
1087     // This is a signed or unsigned 32 -> 64 bit extension.
1088     SrcReg = MI.getOperand(1).getReg();
1089     DstReg = MI.getOperand(0).getReg();
1090     SubIdx = AArch64::sub_32;
1091     return true;
1092   }
1093 }
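// Example (sketch): "sxtw x0, w1" is roughly SBFMXri %x0, %x1, 0, 31, so this
// reports SrcReg = %x1, DstReg = %x0, SubIdx = sub_32, letting the coalescer
// treat the extension as a sub-register copy.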
1094 
1095 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
1096     const MachineInstr &MIa, const MachineInstr &MIb) const {
1097   const TargetRegisterInfo *TRI = &getRegisterInfo();
1098   const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1099   int64_t OffsetA = 0, OffsetB = 0;
1100   TypeSize WidthA(0, false), WidthB(0, false);
1101   bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1102 
1103   assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1104   assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1105 
1106   if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
1107       MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
1108     return false;
1109 
1110   // Retrieve the base, the offset from the base, and the width. Width
1111   // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8).  If
1112   // the bases are identical, and the offset of the lower memory access plus
1113   // its width doesn't overlap the offset of the higher memory access,
1114   // then the memory accesses are disjoint.
1115   // If OffsetAIsScalable and OffsetBIsScalable are both true, they
1116   // are assumed to have the same scale (vscale).
1117   if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
1118                                    WidthA, TRI) &&
1119       getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
1120                                    WidthB, TRI)) {
1121     if (BaseOpA->isIdenticalTo(*BaseOpB) &&
1122         OffsetAIsScalable == OffsetBIsScalable) {
1123       int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1124       int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1125       TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1126       if (LowWidth.isScalable() == OffsetAIsScalable &&
1127           LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
1128         return true;
1129     }
1130   }
1131   return false;
1132 }
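// For example (sketch): two 8-byte stores off the same base register at byte
// offsets 0 and 8 satisfy LowOffset + LowWidth <= HighOffset and are reported
// as disjoint; if only one of the offsets were scalable, the code above gives up.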
1133 
1134 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
1135                                             const MachineBasicBlock *MBB,
1136                                             const MachineFunction &MF) const {
1137   if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
1138     return true;
1139 
1140   // Do not move an instruction that can be recognized as a branch target.
1141   if (hasBTISemantics(MI))
1142     return true;
1143 
1144   switch (MI.getOpcode()) {
1145   case AArch64::HINT:
1146     // CSDB hints are scheduling barriers.
1147     if (MI.getOperand(0).getImm() == 0x14)
1148       return true;
1149     break;
1150   case AArch64::DSB:
1151   case AArch64::ISB:
1152     // DSB and ISB also are scheduling barriers.
1153     return true;
1154   case AArch64::MSRpstatesvcrImm1:
1155     // SMSTART and SMSTOP are also scheduling barriers.
1156     return true;
1157   default:;
1158   }
1159   if (isSEHInstruction(MI))
1160     return true;
1161   auto Next = std::next(MI.getIterator());
1162   return Next != MBB->end() && Next->isCFIInstruction();
1163 }
1164 
1165 /// analyzeCompare - For a comparison instruction, return the source registers
1166 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1167 /// Return true if the comparison instruction can be analyzed.
1168 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
1169                                       Register &SrcReg2, int64_t &CmpMask,
1170                                       int64_t &CmpValue) const {
1171   // The first operand can be a frame index where we'd normally expect a
1172   // register.
1173   assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1174   if (!MI.getOperand(1).isReg())
1175     return false;
1176 
1177   switch (MI.getOpcode()) {
1178   default:
1179     break;
1180   case AArch64::PTEST_PP:
1181   case AArch64::PTEST_PP_ANY:
1182     SrcReg = MI.getOperand(0).getReg();
1183     SrcReg2 = MI.getOperand(1).getReg();
1184     // Not sure about the mask and value for now...
1185     CmpMask = ~0;
1186     CmpValue = 0;
1187     return true;
1188   case AArch64::SUBSWrr:
1189   case AArch64::SUBSWrs:
1190   case AArch64::SUBSWrx:
1191   case AArch64::SUBSXrr:
1192   case AArch64::SUBSXrs:
1193   case AArch64::SUBSXrx:
1194   case AArch64::ADDSWrr:
1195   case AArch64::ADDSWrs:
1196   case AArch64::ADDSWrx:
1197   case AArch64::ADDSXrr:
1198   case AArch64::ADDSXrs:
1199   case AArch64::ADDSXrx:
1200     // Replace SUBSWrr with SUBWrr if NZCV is not used.
1201     SrcReg = MI.getOperand(1).getReg();
1202     SrcReg2 = MI.getOperand(2).getReg();
1203     CmpMask = ~0;
1204     CmpValue = 0;
1205     return true;
1206   case AArch64::SUBSWri:
1207   case AArch64::ADDSWri:
1208   case AArch64::SUBSXri:
1209   case AArch64::ADDSXri:
1210     SrcReg = MI.getOperand(1).getReg();
1211     SrcReg2 = 0;
1212     CmpMask = ~0;
1213     CmpValue = MI.getOperand(2).getImm();
1214     return true;
1215   case AArch64::ANDSWri:
1216   case AArch64::ANDSXri:
1217     // ANDS does not use the same encoding scheme as the other xxxS
1218     // instructions.
1219     SrcReg = MI.getOperand(1).getReg();
1220     SrcReg2 = 0;
1221     CmpMask = ~0;
1222     CmpValue = AArch64_AM::decodeLogicalImmediate(
1223                    MI.getOperand(2).getImm(),
1224                    MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
1225     return true;
1226   }
1227 
1228   return false;
1229 }
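// For instance (sketch): "subs w8, w9, #42" (SUBSWri) is reported as
// SrcReg = w9, SrcReg2 = 0, CmpMask = ~0, CmpValue = 42, while the
// register-register forms report both source registers and CmpValue = 0.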
1230 
1231 static bool UpdateOperandRegClass(MachineInstr &Instr) {
1232   MachineBasicBlock *MBB = Instr.getParent();
1233   assert(MBB && "Can't get MachineBasicBlock here");
1234   MachineFunction *MF = MBB->getParent();
1235   assert(MF && "Can't get MachineFunction here");
1236   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1237   const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1238   MachineRegisterInfo *MRI = &MF->getRegInfo();
1239 
1240   for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1241        ++OpIdx) {
1242     MachineOperand &MO = Instr.getOperand(OpIdx);
1243     const TargetRegisterClass *OpRegCstraints =
1244         Instr.getRegClassConstraint(OpIdx, TII, TRI);
1245 
1246     // If there's no constraint, there's nothing to do.
1247     if (!OpRegCstraints)
1248       continue;
1249     // If the operand is a frame index, there's nothing to do here.
1250     // A frame index operand will resolve correctly during PEI.
1251     if (MO.isFI())
1252       continue;
1253 
1254     assert(MO.isReg() &&
1255            "Operand has register constraints without being a register!");
1256 
1257     Register Reg = MO.getReg();
1258     if (Reg.isPhysical()) {
1259       if (!OpRegCstraints->contains(Reg))
1260         return false;
1261     } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1262                !MRI->constrainRegClass(Reg, OpRegCstraints))
1263       return false;
1264   }
1265 
1266   return true;
1267 }
1268 
1269 /// Return the opcode that does not set flags when possible - otherwise
1270 /// return the original opcode. The caller is responsible for doing the actual
1271 /// substitution and legality checking.
1272 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
1273   // Don't convert all compare instructions, because for some the zero register
1274   // encoding becomes the sp register.
1275   bool MIDefinesZeroReg = false;
1276   if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
1277     MIDefinesZeroReg = true;
1278 
1279   switch (MI.getOpcode()) {
1280   default:
1281     return MI.getOpcode();
1282   case AArch64::ADDSWrr:
1283     return AArch64::ADDWrr;
1284   case AArch64::ADDSWri:
1285     return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1286   case AArch64::ADDSWrs:
1287     return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1288   case AArch64::ADDSWrx:
1289     return AArch64::ADDWrx;
1290   case AArch64::ADDSXrr:
1291     return AArch64::ADDXrr;
1292   case AArch64::ADDSXri:
1293     return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1294   case AArch64::ADDSXrs:
1295     return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1296   case AArch64::ADDSXrx:
1297     return AArch64::ADDXrx;
1298   case AArch64::SUBSWrr:
1299     return AArch64::SUBWrr;
1300   case AArch64::SUBSWri:
1301     return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1302   case AArch64::SUBSWrs:
1303     return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1304   case AArch64::SUBSWrx:
1305     return AArch64::SUBWrx;
1306   case AArch64::SUBSXrr:
1307     return AArch64::SUBXrr;
1308   case AArch64::SUBSXri:
1309     return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1310   case AArch64::SUBSXrs:
1311     return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1312   case AArch64::SUBSXrx:
1313     return AArch64::SUBXrx;
1314   }
1315 }
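// Note added for clarity (not an upstream comment): the MIDefinesZeroReg guard
// above exists because in some non-flag-setting forms register 31 in the
// destination encodes SP rather than the zero register; rewriting
// "adds wzr, w0, #1" (a cmn) as ADDWri, for example, would silently make the
// destination wsp.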
1316 
1317 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1318 
1319 /// True when condition flags are accessed (either by writing or reading)
1320 /// on the instruction trace starting at From and ending at To.
1321 ///
1322 /// Note: If From and To are in different blocks, it is assumed the condition
1323 ///       flags are accessed on the path.
1324 static bool areCFlagsAccessedBetweenInstrs(
1325     MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1326     const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1327   // Early exit if To is at the beginning of the BB.
1328   if (To == To->getParent()->begin())
1329     return true;
1330 
1331   // Check whether the instructions are in the same basic block
1332   // If not, assume the condition flags might get modified somewhere.
1333   if (To->getParent() != From->getParent())
1334     return true;
1335 
1336   // From must be above To.
1337   assert(std::any_of(
1338       ++To.getReverse(), To->getParent()->rend(),
1339       [From](MachineInstr &MI) { return MI.getIterator() == From; }));
1340 
1341   // We iterate backward starting at \p To until we hit \p From.
1342   for (const MachineInstr &Instr :
1343        instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
1344     if (((AccessToCheck & AK_Write) &&
1345          Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1346         ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1347       return true;
1348   }
1349   return false;
1350 }
1351 
1352 /// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1353 /// operation which could set the flags in an identical manner
1354 bool AArch64InstrInfo::optimizePTestInstr(
1355     MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1356     const MachineRegisterInfo *MRI) const {
1357   auto *Mask = MRI->getUniqueVRegDef(MaskReg);
1358   auto *Pred = MRI->getUniqueVRegDef(PredReg);
1359   auto NewOp = Pred->getOpcode();
1360   bool OpChanged = false;
1361 
1362   unsigned MaskOpcode = Mask->getOpcode();
1363   unsigned PredOpcode = Pred->getOpcode();
1364   bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
1365   bool PredIsWhileLike = isWhileOpcode(PredOpcode);
1366 
1367   if (isPTrueOpcode(MaskOpcode) && (PredIsPTestLike || PredIsWhileLike) &&
1368       getElementSizeForOpcode(MaskOpcode) ==
1369           getElementSizeForOpcode(PredOpcode) &&
1370       Mask->getOperand(1).getImm() == 31) {
1371     // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
1372     // redundant since WHILE performs an implicit PTEST with an all active
1373     // mask. Must be an all active predicate of matching element size.
1374 
1375     // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
1376     // PTEST_LIKE instruction uses the same all active mask and the element
1377     // size matches. If the PTEST has a condition of any then it is always
1378     // redundant.
1379     if (PredIsPTestLike) {
1380       auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1381       if (Mask != PTestLikeMask && PTest->getOpcode() != AArch64::PTEST_PP_ANY)
1382         return false;
1383     }
1384 
1385     // Fallthrough to simply remove the PTEST.
1386   } else if ((Mask == Pred) && (PredIsPTestLike || PredIsWhileLike) &&
1387              PTest->getOpcode() == AArch64::PTEST_PP_ANY) {
1388     // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1389     // instruction that sets the flags as PTEST would. This is only valid when
1390     // the condition is any.
1391 
1392     // Fallthrough to simply remove the PTEST.
1393   } else if (PredIsPTestLike) {
1394     // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
1395     // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
1396     // on 8-bit predicates like the PTEST.  Otherwise, for instructions like
1397     // compare that also support 16/32/64-bit predicates, the implicit PTEST
1398     // performed by the compare could consider fewer lanes for these element
1399     // sizes.
1400     //
1401     // For example, consider
1402     //
1403     //   ptrue p0.b                    ; P0=1111-1111-1111-1111
1404     //   index z0.s, #0, #1            ; Z0=<0,1,2,3>
1405     //   index z1.s, #1, #1            ; Z1=<1,2,3,4>
1406     //   cmphi p1.s, p0/z, z1.s, z0.s  ; P1=0001-0001-0001-0001
1407     //                                 ;       ^ last active
1408     //   ptest p0, p1.b                ; P1=0001-0001-0001-0001
1409     //                                 ;     ^ last active
1410     //
1411     // where the compare generates a canonical all active 32-bit predicate
1412     // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
1413     // active flag, whereas the PTEST instruction with the same mask doesn't.
1414     // For PTEST_ANY this doesn't apply as the flags in this case would be
1415     // identical regardless of element size.
1416     auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1417     uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
1418     if ((Mask != PTestLikeMask) ||
1419         (PredElementSize != AArch64::ElementSizeB &&
1420          PTest->getOpcode() != AArch64::PTEST_PP_ANY))
1421       return false;
1422 
1423     // Fallthrough to simply remove the PTEST.
1424   } else {
1425     // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
1426     // opcode so the PTEST becomes redundant.
1427     switch (PredOpcode) {
1428     case AArch64::AND_PPzPP:
1429     case AArch64::BIC_PPzPP:
1430     case AArch64::EOR_PPzPP:
1431     case AArch64::NAND_PPzPP:
1432     case AArch64::NOR_PPzPP:
1433     case AArch64::ORN_PPzPP:
1434     case AArch64::ORR_PPzPP:
1435     case AArch64::BRKA_PPzP:
1436     case AArch64::BRKPA_PPzPP:
1437     case AArch64::BRKB_PPzP:
1438     case AArch64::BRKPB_PPzPP:
1439     case AArch64::RDFFR_PPz: {
1440       // Check to see if our mask is the same. If not, the resulting flag bits
1441       // may be different and we can't remove the ptest.
1442       auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1443       if (Mask != PredMask)
1444         return false;
1445       break;
1446     }
1447     case AArch64::BRKN_PPzP: {
1448       // BRKN uses an all active implicit mask to set flags unlike the other
1449       // flag-setting instructions.
1450       // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
1451       if ((MaskOpcode != AArch64::PTRUE_B) ||
1452           (Mask->getOperand(1).getImm() != 31))
1453         return false;
1454       break;
1455     }
1456     case AArch64::PTRUE_B:
1457       // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
1458       break;
1459     default:
1460       // Bail out if we don't recognize the input
1461       return false;
1462     }
1463 
1464     NewOp = convertToFlagSettingOpc(PredOpcode);
1465     OpChanged = true;
1466   }
1467 
1468   const TargetRegisterInfo *TRI = &getRegisterInfo();
1469 
1470   // If another instruction between Pred and PTest accesses flags, don't remove
1471   // the ptest or update the earlier instruction to modify them.
1472   if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
1473     return false;
1474 
1475   // If we pass all the checks, it's safe to remove the PTEST and use the flags
1476   // as they are prior to PTEST. Sometimes this requires the tested PTEST
1477   // operand to be replaced with an equivalent instruction that also sets the
1478   // flags.
1479   Pred->setDesc(get(NewOp));
1480   PTest->eraseFromParent();
1481   if (OpChanged) {
1482     bool succeeded = UpdateOperandRegClass(*Pred);
1483     (void)succeeded;
1484     assert(succeeded && "Operands have incompatible register classes!");
1485     Pred->addRegisterDefined(AArch64::NZCV, TRI);
1486   }
1487 
1488   // Ensure that the flags def is live.
1489   if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1490     unsigned i = 0, e = Pred->getNumOperands();
1491     for (; i != e; ++i) {
1492       MachineOperand &MO = Pred->getOperand(i);
1493       if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1494         MO.setIsDead(false);
1495         break;
1496       }
1497     }
1498   }
1499   return true;
1500 }
1501 
1502 /// Try to optimize a compare instruction. A compare instruction is an
1503 /// instruction which produces AArch64::NZCV. It is a true compare
1504 /// instruction only when there are no uses of its destination register,
1505 /// i.e. the comparison result itself is not needed.
1506 ///
1507 /// The following steps are tried in order:
1508 /// 1. Convert CmpInstr into a non-flag-setting version if NZCV is unused.
1509 /// 2. Remove CmpInstr if, earlier in the block, there is an instruction
1510 ///    producing a needed condition code, or an instruction which can be
1511 ///    converted into such an instruction.
1512 ///    Only comparisons with zero or one are supported.
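     ///
     /// For example (an illustrative sketch), step 2 can turn
     /// \code
     ///   sub  w8, w1, w2
     ///   cmp  w8, #0
     ///   b.ne <bb>
     /// \endcode
     /// into
     /// \code
     ///   subs w8, w1, w2
     ///   b.ne <bb>
     /// \endcode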
1513 bool AArch64InstrInfo::optimizeCompareInstr(
1514     MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
1515     int64_t CmpValue, const MachineRegisterInfo *MRI) const {
1516   assert(CmpInstr.getParent());
1517   assert(MRI);
1518 
1519   // Replace SUBSWrr with SUBWrr if NZCV is not used.
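       // E.g. (illustrative) a 'subs w0, w1, w2' whose NZCV def is dead becomes
       // 'sub w0, w1, w2'; if the instruction also writes WZR/XZR it is erased
       // entirely instead.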
1520   int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
1521   if (DeadNZCVIdx != -1) {
1522     if (CmpInstr.definesRegister(AArch64::WZR) ||
1523         CmpInstr.definesRegister(AArch64::XZR)) {
1524       CmpInstr.eraseFromParent();
1525       return true;
1526     }
1527     unsigned Opc = CmpInstr.getOpcode();
1528     unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1529     if (NewOpc == Opc)
1530       return false;
1531     const MCInstrDesc &MCID = get(NewOpc);
1532     CmpInstr.setDesc(MCID);
1533     CmpInstr.removeOperand(DeadNZCVIdx);
1534     bool succeeded = UpdateOperandRegClass(CmpInstr);
1535     (void)succeeded;
1536     assert(succeeded && "Some operands reg class are incompatible!");
1537     return true;
1538   }
1539 
1540   if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
1541       CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY)
1542     return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
1543 
1544   if (SrcReg2 != 0)
1545     return false;
1546 
1547   // CmpInstr is a Compare instruction if destination register is not used.
1548   if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1549     return false;
1550 
1551   if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
1552     return true;
1553   return (CmpValue == 0 || CmpValue == 1) &&
1554          removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
1555 }
1556 
1557 /// Get the opcode of the S (flag-setting) version of Instr.
1558 /// If Instr is already the S version, its opcode is returned.
1559 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
1560 /// version or we are not interested in it.
1561 static unsigned sForm(MachineInstr &Instr) {
1562   switch (Instr.getOpcode()) {
1563   default:
1564     return AArch64::INSTRUCTION_LIST_END;
1565 
1566   case AArch64::ADDSWrr:
1567   case AArch64::ADDSWri:
1568   case AArch64::ADDSXrr:
1569   case AArch64::ADDSXri:
1570   case AArch64::SUBSWrr:
1571   case AArch64::SUBSWri:
1572   case AArch64::SUBSXrr:
1573   case AArch64::SUBSXri:
1574     return Instr.getOpcode();
1575 
1576   case AArch64::ADDWrr:
1577     return AArch64::ADDSWrr;
1578   case AArch64::ADDWri:
1579     return AArch64::ADDSWri;
1580   case AArch64::ADDXrr:
1581     return AArch64::ADDSXrr;
1582   case AArch64::ADDXri:
1583     return AArch64::ADDSXri;
1584   case AArch64::ADCWr:
1585     return AArch64::ADCSWr;
1586   case AArch64::ADCXr:
1587     return AArch64::ADCSXr;
1588   case AArch64::SUBWrr:
1589     return AArch64::SUBSWrr;
1590   case AArch64::SUBWri:
1591     return AArch64::SUBSWri;
1592   case AArch64::SUBXrr:
1593     return AArch64::SUBSXrr;
1594   case AArch64::SUBXri:
1595     return AArch64::SUBSXri;
1596   case AArch64::SBCWr:
1597     return AArch64::SBCSWr;
1598   case AArch64::SBCXr:
1599     return AArch64::SBCSXr;
1600   case AArch64::ANDWri:
1601     return AArch64::ANDSWri;
1602   case AArch64::ANDXri:
1603     return AArch64::ANDSXri;
1604   }
1605 }
1606 
1607 /// Check if AArch64::NZCV should be alive in successors of MBB.
1608 static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
1609   for (auto *BB : MBB->successors())
1610     if (BB->isLiveIn(AArch64::NZCV))
1611       return true;
1612   return false;
1613 }
1614 
1615 /// \returns The condition code operand index for \p Instr if it is a branch
1616 /// or select and -1 otherwise.
1617 static int
1618 findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
1619   switch (Instr.getOpcode()) {
1620   default:
1621     return -1;
1622 
1623   case AArch64::Bcc: {
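         // Bcc's explicit operands are (cond, target); NZCV is an implicit use
         // appended after them, so the condition code is expected two operands
         // before the NZCV use.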
1624     int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1625     assert(Idx >= 2);
1626     return Idx - 2;
1627   }
1628 
1629   case AArch64::CSINVWr:
1630   case AArch64::CSINVXr:
1631   case AArch64::CSINCWr:
1632   case AArch64::CSINCXr:
1633   case AArch64::CSELWr:
1634   case AArch64::CSELXr:
1635   case AArch64::CSNEGWr:
1636   case AArch64::CSNEGXr:
1637   case AArch64::FCSELSrrr:
1638   case AArch64::FCSELDrrr: {
1639     int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
1640     assert(Idx >= 1);
1641     return Idx - 1;
1642   }
1643   }
1644 }
1645 
1646 /// Find a condition code used by the instruction.
1647 /// Returns AArch64CC::Invalid if either the instruction does not use condition
1648 /// codes or we don't optimize CmpInstr in the presence of such instructions.
1649 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1650   int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
1651   return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
1652                           Instr.getOperand(CCIdx).getImm())
1653                     : AArch64CC::Invalid;
1654 }
1655 
1656 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1657   assert(CC != AArch64CC::Invalid);
1658   UsedNZCV UsedFlags;
1659   switch (CC) {
1660   default:
1661     break;
1662 
1663   case AArch64CC::EQ: // Z set
1664   case AArch64CC::NE: // Z clear
1665     UsedFlags.Z = true;
1666     break;
1667 
1668   case AArch64CC::HI: // Z clear and C set
1669   case AArch64CC::LS: // Z set   or  C clear
1670     UsedFlags.Z = true;
1671     [[fallthrough]];
1672   case AArch64CC::HS: // C set
1673   case AArch64CC::LO: // C clear
1674     UsedFlags.C = true;
1675     break;
1676 
1677   case AArch64CC::MI: // N set
1678   case AArch64CC::PL: // N clear
1679     UsedFlags.N = true;
1680     break;
1681 
1682   case AArch64CC::VS: // V set
1683   case AArch64CC::VC: // V clear
1684     UsedFlags.V = true;
1685     break;
1686 
1687   case AArch64CC::GT: // Z clear, N and V the same
1688   case AArch64CC::LE: // Z set,   N and V differ
1689     UsedFlags.Z = true;
1690     [[fallthrough]];
1691   case AArch64CC::GE: // N and V the same
1692   case AArch64CC::LT: // N and V differ
1693     UsedFlags.N = true;
1694     UsedFlags.V = true;
1695     break;
1696   }
1697   return UsedFlags;
1698 }
1699 
1700 /// \returns Condition flags used after \p CmpInstr in its MachineBB if NZCV
1701 /// flags are not alive in successors of the basic block that contains both
1702 /// \p CmpInstr and \p MI. \returns std::nullopt otherwise.
1703 ///
1704 /// Collects instructions using those flags in \p CCUseInstrs if provided.
1705 std::optional<UsedNZCV>
1706 llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
1707                        const TargetRegisterInfo &TRI,
1708                        SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
1709   MachineBasicBlock *CmpParent = CmpInstr.getParent();
1710   if (MI.getParent() != CmpParent)
1711     return std::nullopt;
1712 
1713   if (areCFlagsAliveInSuccessors(CmpParent))
1714     return std::nullopt;
1715 
1716   UsedNZCV NZCVUsedAfterCmp;
1717   for (MachineInstr &Instr : instructionsWithoutDebug(
1718            std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
1719     if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
1720       AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1721       if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1722         return std::nullopt;
1723       NZCVUsedAfterCmp |= getUsedNZCV(CC);
1724       if (CCUseInstrs)
1725         CCUseInstrs->push_back(&Instr);
1726     }
1727     if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
1728       break;
1729   }
1730   return NZCVUsedAfterCmp;
1731 }
1732 
1733 static bool isADDSRegImm(unsigned Opcode) {
1734   return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1735 }
1736 
1737 static bool isSUBSRegImm(unsigned Opcode) {
1738   return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1739 }
1740 
1741 /// Check if CmpInstr can be substituted by MI.
1742 ///
1743 /// CmpInstr can be substituted:
1744 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1745 /// - and, MI and CmpInstr are from the same MachineBB
1746 /// - and, condition flags are not alive in successors of the CmpInstr parent
1747 /// - and, if MI opcode is the S form there must be no defs of flags between
1748 ///        MI and CmpInstr
1749 ///        or if MI opcode is not the S form there must be neither defs of flags
1750 ///        nor uses of flags between MI and CmpInstr.
1751 /// - and, the C flag is not used after CmpInstr
1752 ///        and the V flag is either not used or MI produces a poison value when
1753 ///        signed overflow occurs (i.e. MI has the no-signed-wrap flag).
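     ///
     /// For example (an illustrative sketch), given
     /// \code
     ///   add  w8, w0, #1      ; known not to wrap (nsw)
     ///   cmp  w8, #0
     ///   b.lt <bb>
     /// \endcode
     /// the compare can be removed by turning the add into 'adds': 'lt' reads the
     /// V flag, but with no-signed-wrap a signed overflow in the add would yield
     /// a poison value, so the flags of the ADDS may be used instead.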
1754 static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
1755                                        const TargetRegisterInfo &TRI) {
1756   // NOTE this assertion guarantees that MI.getOpcode() is an add or a
1757   // subtraction that may or may not set flags.
1758   assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
1759 
1760   const unsigned CmpOpcode = CmpInstr.getOpcode();
1761   if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
1762     return false;
1763 
1764   assert((CmpInstr.getOperand(2).isImm() &&
1765           CmpInstr.getOperand(2).getImm() == 0) &&
1766          "Caller guarantees that CmpInstr compares with constant 0");
1767 
1768   std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
1769   if (!NZVCUsed || NZVCUsed->C)
1770     return false;
1771 
1772   // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
1773   // '%vreg = add ...' or '%vreg = sub ...'.
1774   // Condition flag V is used to indicate signed overflow.
1775   // 1) MI and CmpInstr set N and V to the same value.
1776   // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
1777   //    signed overflow occurs, so CmpInstr could still be simplified away.
1778   if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap))
1779     return false;
1780 
1781   AccessKind AccessToCheck = AK_Write;
1782   if (sForm(MI) != MI.getOpcode())
1783     AccessToCheck = AK_All;
1784   return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
1785 }
1786 
1787 /// Substitute an instruction comparing to zero with another instruction
1788 /// which produces the needed condition flags.
1789 ///
1790 /// Return true on success.
1791 bool AArch64InstrInfo::substituteCmpToZero(
1792     MachineInstr &CmpInstr, unsigned SrcReg,
1793     const MachineRegisterInfo &MRI) const {
1794   // Get the unique definition of SrcReg.
1795   MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1796   if (!MI)
1797     return false;
1798 
1799   const TargetRegisterInfo &TRI = getRegisterInfo();
1800 
1801   unsigned NewOpc = sForm(*MI);
1802   if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1803     return false;
1804 
1805   if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
1806     return false;
1807 
1808   // Update the instruction to set NZCV.
1809   MI->setDesc(get(NewOpc));
1810   CmpInstr.eraseFromParent();
1811   bool succeeded = UpdateOperandRegClass(*MI);
1812   (void)succeeded;
1813   assert(succeeded && "Some operands reg class are incompatible!");
1814   MI->addRegisterDefined(AArch64::NZCV, &TRI);
1815   return true;
1816 }
1817 
1818 /// \returns True if \p CmpInstr can be removed.
1819 ///
1820 /// \p IsInvertCC is true if, after removing \p CmpInstr, condition
1821 /// codes used in \p CCUseInstrs must be inverted.
1822 static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
1823                                  int CmpValue, const TargetRegisterInfo &TRI,
1824                                  SmallVectorImpl<MachineInstr *> &CCUseInstrs,
1825                                  bool &IsInvertCC) {
1826   assert((CmpValue == 0 || CmpValue == 1) &&
1827          "Only comparisons to 0 or 1 considered for removal!");
1828 
1829   // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
1830   unsigned MIOpc = MI.getOpcode();
1831   if (MIOpc == AArch64::CSINCWr) {
1832     if (MI.getOperand(1).getReg() != AArch64::WZR ||
1833         MI.getOperand(2).getReg() != AArch64::WZR)
1834       return false;
1835   } else if (MIOpc == AArch64::CSINCXr) {
1836     if (MI.getOperand(1).getReg() != AArch64::XZR ||
1837         MI.getOperand(2).getReg() != AArch64::XZR)
1838       return false;
1839   } else {
1840     return false;
1841   }
1842   AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI);
1843   if (MICC == AArch64CC::Invalid)
1844     return false;
1845 
1846   // NZCV needs to be defined
1847   if (MI.findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
1848     return false;
1849 
1850   // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
1851   const unsigned CmpOpcode = CmpInstr.getOpcode();
1852   bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
1853   if (CmpValue && !IsSubsRegImm)
1854     return false;
1855   if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
1856     return false;
1857 
1858   // MI conditions allowed: eq, ne, mi, pl
1859   UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
1860   if (MIUsedNZCV.C || MIUsedNZCV.V)
1861     return false;
1862 
1863   std::optional<UsedNZCV> NZCVUsedAfterCmp =
1864       examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
1865   // Condition flags must not be used in CmpInstr basic block successors, and
1866   // only the Z or N flag may be used after CmpInstr within its basic block.
1867   if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
1868     return false;
1869   // Z or N flag used after CmpInstr must correspond to the flag used in MI
1870   if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
1871       (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
1872     return false;
1873   // If CmpInstr is comparison to zero MI conditions are limited to eq, ne
1874   if (MIUsedNZCV.N && !CmpValue)
1875     return false;
1876 
1877   // There must be no defs of flags between MI and CmpInstr
1878   if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
1879     return false;
1880 
1881   // Condition code is inverted in the following cases:
1882   // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1883   // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
1884   IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
1885                (!CmpValue && MICC == AArch64CC::NE);
1886   return true;
1887 }
1888 
1889 /// Remove comparison in csinc-cmp sequence
1890 ///
1891 /// Examples:
1892 /// 1. \code
1893 ///   csinc w9, wzr, wzr, ne
1894 ///   cmp   w9, #0
1895 ///   b.eq
1896 ///    \endcode
1897 /// to
1898 ///    \code
1899 ///   csinc w9, wzr, wzr, ne
1900 ///   b.ne
1901 ///    \endcode
1902 ///
1903 /// 2. \code
1904 ///   csinc x2, xzr, xzr, mi
1905 ///   cmp   x2, #1
1906 ///   b.pl
1907 ///    \endcode
1908 /// to
1909 ///    \code
1910 ///   csinc x2, xzr, xzr, mi
1911 ///   b.pl
1912 ///    \endcode
1913 ///
1914 /// \param  CmpInstr comparison instruction
1915 /// \return True when comparison removed
1916 bool AArch64InstrInfo::removeCmpToZeroOrOne(
1917     MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
1918     const MachineRegisterInfo &MRI) const {
1919   MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1920   if (!MI)
1921     return false;
1922   const TargetRegisterInfo &TRI = getRegisterInfo();
1923   SmallVector<MachineInstr *, 4> CCUseInstrs;
1924   bool IsInvertCC = false;
1925   if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
1926                             IsInvertCC))
1927     return false;
1928   // Make transformation
1929   CmpInstr.eraseFromParent();
1930   if (IsInvertCC) {
1931     // Invert condition codes in CmpInstr CC users
1932     for (MachineInstr *CCUseInstr : CCUseInstrs) {
1933       int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
1934       assert(Idx >= 0 && "Unexpected instruction using CC.");
1935       MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
1936       AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
1937           static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
1938       CCOperand.setImm(CCUse);
1939     }
1940   }
1941   return true;
1942 }
1943 
1944 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1945   if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1946       MI.getOpcode() != AArch64::CATCHRET)
1947     return false;
1948 
1949   MachineBasicBlock &MBB = *MI.getParent();
1950   auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
1951   auto TRI = Subtarget.getRegisterInfo();
1952   DebugLoc DL = MI.getDebugLoc();
1953 
1954   if (MI.getOpcode() == AArch64::CATCHRET) {
1955     // Skip to the first instruction before the epilog.
1956     const TargetInstrInfo *TII =
1957       MBB.getParent()->getSubtarget().getInstrInfo();
1958     MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
1959     auto MBBI = MachineBasicBlock::iterator(MI);
1960     MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
1961     while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
1962            FirstEpilogSEH != MBB.begin())
1963       FirstEpilogSEH = std::prev(FirstEpilogSEH);
1964     if (FirstEpilogSEH != MBB.begin())
1965       FirstEpilogSEH = std::next(FirstEpilogSEH);
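         // Materialize the address of TargetMBB in X0, roughly:
         //   adrp x0, <target>
         //   add  x0, x0, :lo12:<target>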
1966     BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
1967         .addReg(AArch64::X0, RegState::Define)
1968         .addMBB(TargetMBB);
1969     BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
1970         .addReg(AArch64::X0, RegState::Define)
1971         .addReg(AArch64::X0)
1972         .addMBB(TargetMBB)
1973         .addImm(0);
1974     return true;
1975   }
1976 
1977   Register Reg = MI.getOperand(0).getReg();
1978   Module &M = *MBB.getParent()->getFunction().getParent();
1979   if (M.getStackProtectorGuard() == "sysreg") {
1980     const AArch64SysReg::SysReg *SrcReg =
1981         AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
1982     if (!SrcReg)
1983       report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
1984 
1985     // mrs xN, sysreg
1986     BuildMI(MBB, MI, DL, get(AArch64::MRS))
1987         .addDef(Reg, RegState::Renamable)
1988         .addImm(SrcReg->Encoding);
1989     int Offset = M.getStackProtectorGuardOffset();
1990     if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
1991       // ldr xN, [xN, #offset]
1992       BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
1993           .addDef(Reg)
1994           .addUse(Reg, RegState::Kill)
1995           .addImm(Offset / 8);
1996     } else if (Offset >= -256 && Offset <= 255) {
1997       // ldur xN, [xN, #offset]
1998       BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
1999           .addDef(Reg)
2000           .addUse(Reg, RegState::Kill)
2001           .addImm(Offset);
2002     } else if (Offset >= -4095 && Offset <= 4095) {
2003       if (Offset > 0) {
2004         // add xN, xN, #offset
2005         BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
2006             .addDef(Reg)
2007             .addUse(Reg, RegState::Kill)
2008             .addImm(Offset)
2009             .addImm(0);
2010       } else {
2011         // sub xN, xN, #offset
2012         BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
2013             .addDef(Reg)
2014             .addUse(Reg, RegState::Kill)
2015             .addImm(-Offset)
2016             .addImm(0);
2017       }
2018       // ldr xN, [xN]
2019       BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2020           .addDef(Reg)
2021           .addUse(Reg, RegState::Kill)
2022           .addImm(0);
2023     } else {
2024       // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
2025       // than 32760.
2026       // It might be nice to use AArch64::MOVi32imm here, which would get
2027       // expanded in PreSched2 after PostRA, but our lone scratch Reg already
2028       // contains the MRS result. findScratchNonCalleeSaveRegister() in
2029       // AArch64FrameLowering might help us find such a scratch register
2030       // though. If we failed to find a scratch register, we could emit a
2031       // stream of add instructions to build up the immediate. Or, we could try
2032       // to insert a AArch64::MOVi32imm before register allocation so that we
2033       // didn't need to scavenge for a scratch register.
2034       report_fatal_error("Unable to encode Stack Protector Guard Offset");
2035     }
2036     MBB.erase(MI);
2037     return true;
2038   }
2039 
2040   const GlobalValue *GV =
2041       cast<GlobalValue>((*MI.memoperands_begin())->getValue());
2042   const TargetMachine &TM = MBB.getParent()->getTarget();
2043   unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2044   const unsigned char MO_NC = AArch64II::MO_NC;
2045 
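       // What remains is LOAD_STACK_GUARD with the guard in a global variable.
       // For GOT-indirect globals the guard's address is loaded from its GOT
       // slot (LOADgot) and the value is then loaded through that pointer.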
2046   if ((OpFlags & AArch64II::MO_GOT) != 0) {
2047     BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
2048         .addGlobalAddress(GV, 0, OpFlags);
2049     if (Subtarget.isTargetILP32()) {
2050       unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2051       BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2052           .addDef(Reg32, RegState::Dead)
2053           .addUse(Reg, RegState::Kill)
2054           .addImm(0)
2055           .addMemOperand(*MI.memoperands_begin())
2056           .addDef(Reg, RegState::Implicit);
2057     } else {
2058       BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2059           .addReg(Reg, RegState::Kill)
2060           .addImm(0)
2061           .addMemOperand(*MI.memoperands_begin());
2062     }
2063   } else if (TM.getCodeModel() == CodeModel::Large) {
2064     assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
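         // Large code model: materialize the full 64-bit address of GV with a
         // movz + movk chain (16 bits at a time), then load the guard value
         // through it.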
2065     BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
2066         .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
2067         .addImm(0);
2068     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2069         .addReg(Reg, RegState::Kill)
2070         .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
2071         .addImm(16);
2072     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2073         .addReg(Reg, RegState::Kill)
2074         .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
2075         .addImm(32);
2076     BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2077         .addReg(Reg, RegState::Kill)
2078         .addGlobalAddress(GV, 0, AArch64II::MO_G3)
2079         .addImm(48);
2080     BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2081         .addReg(Reg, RegState::Kill)
2082         .addImm(0)
2083         .addMemOperand(*MI.memoperands_begin());
2084   } else if (TM.getCodeModel() == CodeModel::Tiny) {
2085     BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
2086         .addGlobalAddress(GV, 0, OpFlags);
2087   } else {
2088     BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
2089         .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2090     unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2091     if (Subtarget.isTargetILP32()) {
2092       unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2093       BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2094           .addDef(Reg32, RegState::Dead)
2095           .addUse(Reg, RegState::Kill)
2096           .addGlobalAddress(GV, 0, LoFlags)
2097           .addMemOperand(*MI.memoperands_begin())
2098           .addDef(Reg, RegState::Implicit);
2099     } else {
2100       BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2101           .addReg(Reg, RegState::Kill)
2102           .addGlobalAddress(GV, 0, LoFlags)
2103           .addMemOperand(*MI.memoperands_begin());
2104     }
2105   }
2106 
2107   MBB.erase(MI);
2108 
2109   return true;
2110 }
2111 
2112 // Return true if this instruction simply sets its single destination register
2113 // to zero. This is equivalent to a register rename of the zero-register.
2114 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
2115   switch (MI.getOpcode()) {
2116   default:
2117     break;
2118   case AArch64::MOVZWi:
2119   case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2120     if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2121       assert(MI.getDesc().getNumOperands() == 3 &&
2122              MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2123       return true;
2124     }
2125     break;
2126   case AArch64::ANDWri: // and Rd, Rzr, #imm
2127     return MI.getOperand(1).getReg() == AArch64::WZR;
2128   case AArch64::ANDXri:
2129     return MI.getOperand(1).getReg() == AArch64::XZR;
2130   case TargetOpcode::COPY:
2131     return MI.getOperand(1).getReg() == AArch64::WZR;
2132   }
2133   return false;
2134 }
2135 
2136 // Return true if this instruction simply renames a general register without
2137 // modifying bits.
2138 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
2139   switch (MI.getOpcode()) {
2140   default:
2141     break;
2142   case TargetOpcode::COPY: {
2143     // GPR32 copies will be lowered to ORRXrs
2144     Register DstReg = MI.getOperand(0).getReg();
2145     return (AArch64::GPR32RegClass.contains(DstReg) ||
2146             AArch64::GPR64RegClass.contains(DstReg));
2147   }
2148   case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2149     if (MI.getOperand(1).getReg() == AArch64::XZR) {
2150       assert(MI.getDesc().getNumOperands() == 4 &&
2151              MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2152       return true;
2153     }
2154     break;
2155   case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2156     if (MI.getOperand(2).getImm() == 0) {
2157       assert(MI.getDesc().getNumOperands() == 4 &&
2158              MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2159       return true;
2160     }
2161     break;
2162   }
2163   return false;
2164 }
2165 
2166 // Return true if this instruction simply renames a general register without
2167 // modifying bits.
2168 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
2169   switch (MI.getOpcode()) {
2170   default:
2171     break;
2172   case TargetOpcode::COPY: {
2173     Register DstReg = MI.getOperand(0).getReg();
2174     return AArch64::FPR128RegClass.contains(DstReg);
2175   }
2176   case AArch64::ORRv16i8:
2177     if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2178       assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2179              "invalid ORRv16i8 operands");
2180       return true;
2181     }
2182     break;
2183   }
2184   return false;
2185 }
2186 
2187 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
2188                                                int &FrameIndex) const {
2189   switch (MI.getOpcode()) {
2190   default:
2191     break;
2192   case AArch64::LDRWui:
2193   case AArch64::LDRXui:
2194   case AArch64::LDRBui:
2195   case AArch64::LDRHui:
2196   case AArch64::LDRSui:
2197   case AArch64::LDRDui:
2198   case AArch64::LDRQui:
2199   case AArch64::LDR_PXI:
2200     if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2201         MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2202       FrameIndex = MI.getOperand(1).getIndex();
2203       return MI.getOperand(0).getReg();
2204     }
2205     break;
2206   }
2207 
2208   return 0;
2209 }
2210 
2211 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
2212                                               int &FrameIndex) const {
2213   switch (MI.getOpcode()) {
2214   default:
2215     break;
2216   case AArch64::STRWui:
2217   case AArch64::STRXui:
2218   case AArch64::STRBui:
2219   case AArch64::STRHui:
2220   case AArch64::STRSui:
2221   case AArch64::STRDui:
2222   case AArch64::STRQui:
2223   case AArch64::STR_PXI:
2224     if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2225         MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2226       FrameIndex = MI.getOperand(1).getIndex();
2227       return MI.getOperand(0).getReg();
2228     }
2229     break;
2230   }
2231   return 0;
2232 }
2233 
2234 /// Check all MachineMemOperands for a hint to suppress pairing.
2235 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
2236   return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2237     return MMO->getFlags() & MOSuppressPair;
2238   });
2239 }
2240 
2241 /// Set a flag on the first MachineMemOperand to suppress pairing.
2242 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
2243   if (MI.memoperands_empty())
2244     return;
2245   (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2246 }
2247 
2248 /// Check all MachineMemOperands for a hint that the load/store is strided.
2249 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
2250   return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2251     return MMO->getFlags() & MOStridedAccess;
2252   });
2253 }
2254 
2255 bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) {
2256   switch (Opc) {
2257   default:
2258     return false;
2259   case AArch64::STURSi:
2260   case AArch64::STRSpre:
2261   case AArch64::STURDi:
2262   case AArch64::STRDpre:
2263   case AArch64::STURQi:
2264   case AArch64::STRQpre:
2265   case AArch64::STURBBi:
2266   case AArch64::STURHHi:
2267   case AArch64::STURWi:
2268   case AArch64::STRWpre:
2269   case AArch64::STURXi:
2270   case AArch64::STRXpre:
2271   case AArch64::LDURSi:
2272   case AArch64::LDRSpre:
2273   case AArch64::LDURDi:
2274   case AArch64::LDRDpre:
2275   case AArch64::LDURQi:
2276   case AArch64::LDRQpre:
2277   case AArch64::LDURWi:
2278   case AArch64::LDRWpre:
2279   case AArch64::LDURXi:
2280   case AArch64::LDRXpre:
2281   case AArch64::LDRSWpre:
2282   case AArch64::LDURSWi:
2283   case AArch64::LDURHHi:
2284   case AArch64::LDURBBi:
2285   case AArch64::LDURSBWi:
2286   case AArch64::LDURSHWi:
2287     return true;
2288   }
2289 }
2290 
2291 std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2292   switch (Opc) {
2293   default: return {};
2294   case AArch64::PRFMui: return AArch64::PRFUMi;
2295   case AArch64::LDRXui: return AArch64::LDURXi;
2296   case AArch64::LDRWui: return AArch64::LDURWi;
2297   case AArch64::LDRBui: return AArch64::LDURBi;
2298   case AArch64::LDRHui: return AArch64::LDURHi;
2299   case AArch64::LDRSui: return AArch64::LDURSi;
2300   case AArch64::LDRDui: return AArch64::LDURDi;
2301   case AArch64::LDRQui: return AArch64::LDURQi;
2302   case AArch64::LDRBBui: return AArch64::LDURBBi;
2303   case AArch64::LDRHHui: return AArch64::LDURHHi;
2304   case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2305   case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2306   case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2307   case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2308   case AArch64::LDRSWui: return AArch64::LDURSWi;
2309   case AArch64::STRXui: return AArch64::STURXi;
2310   case AArch64::STRWui: return AArch64::STURWi;
2311   case AArch64::STRBui: return AArch64::STURBi;
2312   case AArch64::STRHui: return AArch64::STURHi;
2313   case AArch64::STRSui: return AArch64::STURSi;
2314   case AArch64::STRDui: return AArch64::STURDi;
2315   case AArch64::STRQui: return AArch64::STURQi;
2316   case AArch64::STRBBui: return AArch64::STURBBi;
2317   case AArch64::STRHHui: return AArch64::STURHHi;
2318   }
2319 }
2320 
2321 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
2322   switch (Opc) {
2323   default:
2324     return 2;
2325   case AArch64::LDPXi:
2326   case AArch64::LDPDi:
2327   case AArch64::STPXi:
2328   case AArch64::STPDi:
2329   case AArch64::LDNPXi:
2330   case AArch64::LDNPDi:
2331   case AArch64::STNPXi:
2332   case AArch64::STNPDi:
2333   case AArch64::LDPQi:
2334   case AArch64::STPQi:
2335   case AArch64::LDNPQi:
2336   case AArch64::STNPQi:
2337   case AArch64::LDPWi:
2338   case AArch64::LDPSi:
2339   case AArch64::STPWi:
2340   case AArch64::STPSi:
2341   case AArch64::LDNPWi:
2342   case AArch64::LDNPSi:
2343   case AArch64::STNPWi:
2344   case AArch64::STNPSi:
2345   case AArch64::LDG:
2346   case AArch64::STGPi:
2347 
2348   case AArch64::LD1B_IMM:
2349   case AArch64::LD1B_H_IMM:
2350   case AArch64::LD1B_S_IMM:
2351   case AArch64::LD1B_D_IMM:
2352   case AArch64::LD1SB_H_IMM:
2353   case AArch64::LD1SB_S_IMM:
2354   case AArch64::LD1SB_D_IMM:
2355   case AArch64::LD1H_IMM:
2356   case AArch64::LD1H_S_IMM:
2357   case AArch64::LD1H_D_IMM:
2358   case AArch64::LD1SH_S_IMM:
2359   case AArch64::LD1SH_D_IMM:
2360   case AArch64::LD1W_IMM:
2361   case AArch64::LD1W_D_IMM:
2362   case AArch64::LD1SW_D_IMM:
2363   case AArch64::LD1D_IMM:
2364 
2365   case AArch64::LD2B_IMM:
2366   case AArch64::LD2H_IMM:
2367   case AArch64::LD2W_IMM:
2368   case AArch64::LD2D_IMM:
2369   case AArch64::LD3B_IMM:
2370   case AArch64::LD3H_IMM:
2371   case AArch64::LD3W_IMM:
2372   case AArch64::LD3D_IMM:
2373   case AArch64::LD4B_IMM:
2374   case AArch64::LD4H_IMM:
2375   case AArch64::LD4W_IMM:
2376   case AArch64::LD4D_IMM:
2377 
2378   case AArch64::ST1B_IMM:
2379   case AArch64::ST1B_H_IMM:
2380   case AArch64::ST1B_S_IMM:
2381   case AArch64::ST1B_D_IMM:
2382   case AArch64::ST1H_IMM:
2383   case AArch64::ST1H_S_IMM:
2384   case AArch64::ST1H_D_IMM:
2385   case AArch64::ST1W_IMM:
2386   case AArch64::ST1W_D_IMM:
2387   case AArch64::ST1D_IMM:
2388 
2389   case AArch64::ST2B_IMM:
2390   case AArch64::ST2H_IMM:
2391   case AArch64::ST2W_IMM:
2392   case AArch64::ST2D_IMM:
2393   case AArch64::ST3B_IMM:
2394   case AArch64::ST3H_IMM:
2395   case AArch64::ST3W_IMM:
2396   case AArch64::ST3D_IMM:
2397   case AArch64::ST4B_IMM:
2398   case AArch64::ST4H_IMM:
2399   case AArch64::ST4W_IMM:
2400   case AArch64::ST4D_IMM:
2401 
2402   case AArch64::LD1RB_IMM:
2403   case AArch64::LD1RB_H_IMM:
2404   case AArch64::LD1RB_S_IMM:
2405   case AArch64::LD1RB_D_IMM:
2406   case AArch64::LD1RSB_H_IMM:
2407   case AArch64::LD1RSB_S_IMM:
2408   case AArch64::LD1RSB_D_IMM:
2409   case AArch64::LD1RH_IMM:
2410   case AArch64::LD1RH_S_IMM:
2411   case AArch64::LD1RH_D_IMM:
2412   case AArch64::LD1RSH_S_IMM:
2413   case AArch64::LD1RSH_D_IMM:
2414   case AArch64::LD1RW_IMM:
2415   case AArch64::LD1RW_D_IMM:
2416   case AArch64::LD1RSW_IMM:
2417   case AArch64::LD1RD_IMM:
2418 
2419   case AArch64::LDNT1B_ZRI:
2420   case AArch64::LDNT1H_ZRI:
2421   case AArch64::LDNT1W_ZRI:
2422   case AArch64::LDNT1D_ZRI:
2423   case AArch64::STNT1B_ZRI:
2424   case AArch64::STNT1H_ZRI:
2425   case AArch64::STNT1W_ZRI:
2426   case AArch64::STNT1D_ZRI:
2427 
2428   case AArch64::LDNF1B_IMM:
2429   case AArch64::LDNF1B_H_IMM:
2430   case AArch64::LDNF1B_S_IMM:
2431   case AArch64::LDNF1B_D_IMM:
2432   case AArch64::LDNF1SB_H_IMM:
2433   case AArch64::LDNF1SB_S_IMM:
2434   case AArch64::LDNF1SB_D_IMM:
2435   case AArch64::LDNF1H_IMM:
2436   case AArch64::LDNF1H_S_IMM:
2437   case AArch64::LDNF1H_D_IMM:
2438   case AArch64::LDNF1SH_S_IMM:
2439   case AArch64::LDNF1SH_D_IMM:
2440   case AArch64::LDNF1W_IMM:
2441   case AArch64::LDNF1W_D_IMM:
2442   case AArch64::LDNF1SW_D_IMM:
2443   case AArch64::LDNF1D_IMM:
2444     return 3;
2445   case AArch64::ADDG:
2446   case AArch64::STGi:
2447   case AArch64::LDR_PXI:
2448   case AArch64::STR_PXI:
2449     return 2;
2450   }
2451 }
2452 
2453 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
2454   switch (MI.getOpcode()) {
2455   default:
2456     return false;
2457   // Scaled instructions.
2458   case AArch64::STRSui:
2459   case AArch64::STRDui:
2460   case AArch64::STRQui:
2461   case AArch64::STRXui:
2462   case AArch64::STRWui:
2463   case AArch64::LDRSui:
2464   case AArch64::LDRDui:
2465   case AArch64::LDRQui:
2466   case AArch64::LDRXui:
2467   case AArch64::LDRWui:
2468   case AArch64::LDRSWui:
2469   // Unscaled instructions.
2470   case AArch64::STURSi:
2471   case AArch64::STRSpre:
2472   case AArch64::STURDi:
2473   case AArch64::STRDpre:
2474   case AArch64::STURQi:
2475   case AArch64::STRQpre:
2476   case AArch64::STURWi:
2477   case AArch64::STRWpre:
2478   case AArch64::STURXi:
2479   case AArch64::STRXpre:
2480   case AArch64::LDURSi:
2481   case AArch64::LDRSpre:
2482   case AArch64::LDURDi:
2483   case AArch64::LDRDpre:
2484   case AArch64::LDURQi:
2485   case AArch64::LDRQpre:
2486   case AArch64::LDURWi:
2487   case AArch64::LDRWpre:
2488   case AArch64::LDURXi:
2489   case AArch64::LDRXpre:
2490   case AArch64::LDURSWi:
2491   case AArch64::LDRSWpre:
2492     return true;
2493   }
2494 }
2495 
2496 bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) {
2497   switch (MI.getOpcode()) {
2498   default:
2499     assert((!MI.isCall() || !MI.isReturn()) &&
2500            "Unexpected instruction - was a new tail call opcode introduced?");
2501     return false;
2502   case AArch64::TCRETURNdi:
2503   case AArch64::TCRETURNri:
2504   case AArch64::TCRETURNriBTI:
2505   case AArch64::TCRETURNriALL:
2506     return true;
2507   }
2508 }
2509 
2510 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) {
2511   switch (Opc) {
2512   default:
2513     llvm_unreachable("Opcode has no flag setting equivalent!");
2514   // 32-bit cases:
2515   case AArch64::ADDWri:
2516     return AArch64::ADDSWri;
2517   case AArch64::ADDWrr:
2518     return AArch64::ADDSWrr;
2519   case AArch64::ADDWrs:
2520     return AArch64::ADDSWrs;
2521   case AArch64::ADDWrx:
2522     return AArch64::ADDSWrx;
2523   case AArch64::ANDWri:
2524     return AArch64::ANDSWri;
2525   case AArch64::ANDWrr:
2526     return AArch64::ANDSWrr;
2527   case AArch64::ANDWrs:
2528     return AArch64::ANDSWrs;
2529   case AArch64::BICWrr:
2530     return AArch64::BICSWrr;
2531   case AArch64::BICWrs:
2532     return AArch64::BICSWrs;
2533   case AArch64::SUBWri:
2534     return AArch64::SUBSWri;
2535   case AArch64::SUBWrr:
2536     return AArch64::SUBSWrr;
2537   case AArch64::SUBWrs:
2538     return AArch64::SUBSWrs;
2539   case AArch64::SUBWrx:
2540     return AArch64::SUBSWrx;
2541   // 64-bit cases:
2542   case AArch64::ADDXri:
2543     return AArch64::ADDSXri;
2544   case AArch64::ADDXrr:
2545     return AArch64::ADDSXrr;
2546   case AArch64::ADDXrs:
2547     return AArch64::ADDSXrs;
2548   case AArch64::ADDXrx:
2549     return AArch64::ADDSXrx;
2550   case AArch64::ANDXri:
2551     return AArch64::ANDSXri;
2552   case AArch64::ANDXrr:
2553     return AArch64::ANDSXrr;
2554   case AArch64::ANDXrs:
2555     return AArch64::ANDSXrs;
2556   case AArch64::BICXrr:
2557     return AArch64::BICSXrr;
2558   case AArch64::BICXrs:
2559     return AArch64::BICSXrs;
2560   case AArch64::SUBXri:
2561     return AArch64::SUBSXri;
2562   case AArch64::SUBXrr:
2563     return AArch64::SUBSXrr;
2564   case AArch64::SUBXrs:
2565     return AArch64::SUBSXrs;
2566   case AArch64::SUBXrx:
2567     return AArch64::SUBSXrx;
2568   // SVE instructions:
2569   case AArch64::AND_PPzPP:
2570     return AArch64::ANDS_PPzPP;
2571   case AArch64::BIC_PPzPP:
2572     return AArch64::BICS_PPzPP;
2573   case AArch64::EOR_PPzPP:
2574     return AArch64::EORS_PPzPP;
2575   case AArch64::NAND_PPzPP:
2576     return AArch64::NANDS_PPzPP;
2577   case AArch64::NOR_PPzPP:
2578     return AArch64::NORS_PPzPP;
2579   case AArch64::ORN_PPzPP:
2580     return AArch64::ORNS_PPzPP;
2581   case AArch64::ORR_PPzPP:
2582     return AArch64::ORRS_PPzPP;
2583   case AArch64::BRKA_PPzP:
2584     return AArch64::BRKAS_PPzP;
2585   case AArch64::BRKPA_PPzPP:
2586     return AArch64::BRKPAS_PPzPP;
2587   case AArch64::BRKB_PPzP:
2588     return AArch64::BRKBS_PPzP;
2589   case AArch64::BRKPB_PPzPP:
2590     return AArch64::BRKPBS_PPzPP;
2591   case AArch64::BRKN_PPzP:
2592     return AArch64::BRKNS_PPzP;
2593   case AArch64::RDFFR_PPz:
2594     return AArch64::RDFFRS_PPz;
2595   case AArch64::PTRUE_B:
2596     return AArch64::PTRUES_B;
2597   }
2598 }
2599 
2600 // Is this a candidate for ld/st merging or pairing?  For example, we don't
2601 // touch volatiles or load/stores that have a hint to avoid pair formation.
2602 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
2603 
2604   bool IsPreLdSt = isPreLdSt(MI);
2605 
2606   // If this is a volatile load/store, don't mess with it.
2607   if (MI.hasOrderedMemoryRef())
2608     return false;
2609 
2610   // Make sure this is a reg/fi+imm (as opposed to an address reloc).
2611   // For Pre-inc LD/ST, the operand is shifted by one.
2612   assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
2613           MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
2614          "Expected a reg or frame index operand.");
2615 
2616   // For Pre-indexed addressing quadword instructions, the third operand is the
2617   // immediate value.
2618   bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
2619 
2620   if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
2621     return false;
2622 
2623   // Can't merge/pair if the instruction modifies the base register.
2624   // e.g., ldr x0, [x0]
2625   // This case will never occur with an FI base.
2626   // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
2627   // STR<S,D,Q,W,X>pre, it can be merged.
2628   // For example:
2629   //   ldr q0, [x11, #32]!
2630   //   ldr q1, [x11, #16]
2631   //   to
2632   //   ldp q0, q1, [x11, #32]!
2633   if (MI.getOperand(1).isReg() && !IsPreLdSt) {
2634     Register BaseReg = MI.getOperand(1).getReg();
2635     const TargetRegisterInfo *TRI = &getRegisterInfo();
2636     if (MI.modifiesRegister(BaseReg, TRI))
2637       return false;
2638   }
2639 
2640   // Check if this load/store has a hint to avoid pair formation.
2641   // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
2642   if (isLdStPairSuppressed(MI))
2643     return false;
2644 
2645   // Do not pair any callee-save store/reload instructions in the
2646   // prologue/epilogue if the CFI information encoded the operations as separate
2647   // instructions, as that will cause the size of the actual prologue to mismatch
2648   // with the prologue size recorded in the Windows CFI.
2649   const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
2650   bool NeedsWinCFI = MAI->usesWindowsCFI() &&
2651                      MI.getMF()->getFunction().needsUnwindTableEntry();
2652   if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
2653                       MI.getFlag(MachineInstr::FrameDestroy)))
2654     return false;
2655 
2656   // On some CPUs quad load/store pairs are slower than two single load/stores.
2657   if (Subtarget.isPaired128Slow()) {
2658     switch (MI.getOpcode()) {
2659     default:
2660       break;
2661     case AArch64::LDURQi:
2662     case AArch64::STURQi:
2663     case AArch64::LDRQui:
2664     case AArch64::STRQui:
2665       return false;
2666     }
2667   }
2668 
2669   return true;
2670 }
2671 
2672 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
2673     const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
2674     int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
2675     const TargetRegisterInfo *TRI) const {
2676   if (!LdSt.mayLoadOrStore())
2677     return false;
2678 
2679   const MachineOperand *BaseOp;
2680   TypeSize WidthN(0, false);
2681   if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
2682                                     WidthN, TRI))
2683     return false;
2684   // The maximum vscale is 16 under AArch64; return the maximal extent for the
2685   // vector.
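       // E.g. (illustrative) a <vscale x 16 x i8> access has a known minimum size
       // of 16 bytes; with SVEMaxBitsPerVector = 2048 and SVEBitsPerBlock = 128
       // the reported width is 16 * 2048 / 128 = 256 bytes.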
2686   Width = WidthN.isScalable()
2687               ? WidthN.getKnownMinValue() * AArch64::SVEMaxBitsPerVector /
2688                     AArch64::SVEBitsPerBlock
2689               : WidthN.getKnownMinValue();
2690   BaseOps.push_back(BaseOp);
2691   return true;
2692 }
2693 
2694 std::optional<ExtAddrMode>
2695 AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
2696                                           const TargetRegisterInfo *TRI) const {
2697   const MachineOperand *Base; // Filled with the base operand of MI.
2698   int64_t Offset;             // Filled with the offset of MI.
2699   bool OffsetIsScalable;
2700   if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
2701     return std::nullopt;
2702 
2703   if (!Base->isReg())
2704     return std::nullopt;
2705   ExtAddrMode AM;
2706   AM.BaseReg = Base->getReg();
2707   AM.Displacement = Offset;
2708   AM.ScaledReg = 0;
2709   AM.Scale = 0;
2710   return AM;
2711 }
2712 
2713 bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
2714                                            Register Reg,
2715                                            const MachineInstr &AddrI,
2716                                            ExtAddrMode &AM) const {
2717   // Filter out instructions into which we cannot fold.
2718   unsigned NumBytes;
2719   int64_t OffsetScale = 1;
2720   switch (MemI.getOpcode()) {
2721   default:
2722     return false;
2723 
2724   case AArch64::LDURQi:
2725   case AArch64::STURQi:
2726     NumBytes = 16;
2727     break;
2728 
2729   case AArch64::LDURDi:
2730   case AArch64::STURDi:
2731   case AArch64::LDURXi:
2732   case AArch64::STURXi:
2733     NumBytes = 8;
2734     break;
2735 
2736   case AArch64::LDURWi:
2737   case AArch64::LDURSWi:
2738   case AArch64::STURWi:
2739     NumBytes = 4;
2740     break;
2741 
2742   case AArch64::LDURHi:
2743   case AArch64::STURHi:
2744   case AArch64::LDURHHi:
2745   case AArch64::STURHHi:
2746   case AArch64::LDURSHXi:
2747   case AArch64::LDURSHWi:
2748     NumBytes = 2;
2749     break;
2750 
2751   case AArch64::LDRBroX:
2752   case AArch64::LDRBBroX:
2753   case AArch64::LDRSBXroX:
2754   case AArch64::LDRSBWroX:
2755   case AArch64::STRBroX:
2756   case AArch64::STRBBroX:
2757   case AArch64::LDURBi:
2758   case AArch64::LDURBBi:
2759   case AArch64::LDURSBXi:
2760   case AArch64::LDURSBWi:
2761   case AArch64::STURBi:
2762   case AArch64::STURBBi:
2763   case AArch64::LDRBui:
2764   case AArch64::LDRBBui:
2765   case AArch64::LDRSBXui:
2766   case AArch64::LDRSBWui:
2767   case AArch64::STRBui:
2768   case AArch64::STRBBui:
2769     NumBytes = 1;
2770     break;
2771 
2772   case AArch64::LDRQroX:
2773   case AArch64::STRQroX:
2774   case AArch64::LDRQui:
2775   case AArch64::STRQui:
2776     NumBytes = 16;
2777     OffsetScale = 16;
2778     break;
2779 
2780   case AArch64::LDRDroX:
2781   case AArch64::STRDroX:
2782   case AArch64::LDRXroX:
2783   case AArch64::STRXroX:
2784   case AArch64::LDRDui:
2785   case AArch64::STRDui:
2786   case AArch64::LDRXui:
2787   case AArch64::STRXui:
2788     NumBytes = 8;
2789     OffsetScale = 8;
2790     break;
2791 
2792   case AArch64::LDRWroX:
2793   case AArch64::LDRSWroX:
2794   case AArch64::STRWroX:
2795   case AArch64::LDRWui:
2796   case AArch64::LDRSWui:
2797   case AArch64::STRWui:
2798     NumBytes = 4;
2799     OffsetScale = 4;
2800     break;
2801 
2802   case AArch64::LDRHroX:
2803   case AArch64::STRHroX:
2804   case AArch64::LDRHHroX:
2805   case AArch64::STRHHroX:
2806   case AArch64::LDRSHXroX:
2807   case AArch64::LDRSHWroX:
2808   case AArch64::LDRHui:
2809   case AArch64::STRHui:
2810   case AArch64::LDRHHui:
2811   case AArch64::STRHHui:
2812   case AArch64::LDRSHXui:
2813   case AArch64::LDRSHWui:
2814     NumBytes = 2;
2815     OffsetScale = 2;
2816     break;
2817   }
2818 
2819   // Check the fold operand is not the loaded/stored value.
2820   const MachineOperand &BaseRegOp = MemI.getOperand(0);
2821   if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
2822     return false;
2823 
2824   // Handle memory instructions with a [Reg, Reg] addressing mode.
2825   if (MemI.getOperand(2).isReg()) {
2826     // Bail if the addressing mode already includes extension of the offset
2827     // register.
2828     if (MemI.getOperand(3).getImm())
2829       return false;
2830 
2831     // Check if we actually have a scaled offset.
2832     if (MemI.getOperand(4).getImm() == 0)
2833       OffsetScale = 1;
2834 
2835     // If the address instruction is folded into the base register, then the
2836     // addressing mode must not have a scale, so that we can simply swap the
2837     // base and the offset registers.
2838     if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)
2839       return false;
2840 
2841     switch (AddrI.getOpcode()) {
2842     default:
2843       return false;
2844 
2845     case AArch64::SBFMXri:
2846       // sxtw Xa, Wm
2847       // ldr Xd, [Xn, Xa, lsl #N]
2848       // ->
2849       // ldr Xd, [Xn, Wm, sxtw #N]
2850       if (AddrI.getOperand(2).getImm() != 0 ||
2851           AddrI.getOperand(3).getImm() != 31)
2852         return false;
2853 
2854       AM.BaseReg = MemI.getOperand(1).getReg();
2855       if (AM.BaseReg == Reg)
2856         AM.BaseReg = MemI.getOperand(2).getReg();
2857       AM.ScaledReg = AddrI.getOperand(1).getReg();
2858       AM.Scale = OffsetScale;
2859       AM.Displacement = 0;
2860       AM.Form = ExtAddrMode::Formula::SExtScaledReg;
2861       return true;
2862 
2863     case TargetOpcode::SUBREG_TO_REG: {
2864       // mov Wa, Wm
2865       // ldr Xd, [Xn, Xa, lsl #N]
2866       // ->
2867       // ldr Xd, [Xn, Wm, uxtw #N]
2868 
2869       // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
2870       if (AddrI.getOperand(1).getImm() != 0 ||
2871           AddrI.getOperand(3).getImm() != AArch64::sub_32)
2872         return false;
2873 
2874       const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
2875       Register OffsetReg = AddrI.getOperand(2).getReg();
2876       if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg))
2877         return false;
2878 
2879       const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg);
2880       if (DefMI.getOpcode() != AArch64::ORRWrs ||
2881           DefMI.getOperand(1).getReg() != AArch64::WZR ||
2882           DefMI.getOperand(3).getImm() != 0)
2883         return false;
2884 
2885       AM.BaseReg = MemI.getOperand(1).getReg();
2886       if (AM.BaseReg == Reg)
2887         AM.BaseReg = MemI.getOperand(2).getReg();
2888       AM.ScaledReg = DefMI.getOperand(2).getReg();
2889       AM.Scale = OffsetScale;
2890       AM.Displacement = 0;
2891       AM.Form = ExtAddrMode::Formula::ZExtScaledReg;
2892       return true;
2893     }
2894     }
2895   }
2896 
2897   // Handle memory instructions with a [Reg, #Imm] addressing mode.
2898 
2899   // Check we are not breaking a potential conversion to an LDP.
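       // LDP/STP take a signed 7-bit immediate scaled by the access size, so e.g.
       // for 8-byte accesses the reachable offsets are -64*8 .. 63*8 = -512 .. 504,
       // which is where the ranges below come from.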
2900   auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
2901                                  int64_t NewOffset) -> bool {
2902     int64_t MinOffset, MaxOffset;
2903     switch (NumBytes) {
2904     default:
2905       return true;
2906     case 4:
2907       MinOffset = -256;
2908       MaxOffset = 252;
2909       break;
2910     case 8:
2911       MinOffset = -512;
2912       MaxOffset = 504;
2913       break;
2914     case 16:
2915       MinOffset = -1024;
2916       MaxOffset = 1008;
2917       break;
2918     }
2919     return OldOffset < MinOffset || OldOffset > MaxOffset ||
2920            (NewOffset >= MinOffset && NewOffset <= MaxOffset);
2921   };
2922   auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
2923     int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;
2924     int64_t NewOffset = OldOffset + Disp;
2925     if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))
2926       return false;
2927     // If the old offset would fit into an LDP, but the new offset wouldn't,
2928     // bail out.
2929     if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
2930       return false;
2931     AM.BaseReg = AddrI.getOperand(1).getReg();
2932     AM.ScaledReg = 0;
2933     AM.Scale = 0;
2934     AM.Displacement = NewOffset;
2935     AM.Form = ExtAddrMode::Formula::Basic;
2936     return true;
2937   };
2938 
2939   auto canFoldAddRegIntoAddrMode =
2940       [&](int64_t Scale,
2941           ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool {
2942     if (MemI.getOperand(2).getImm() != 0)
2943       return false;
2944     if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
2945       return false;
2946     AM.BaseReg = AddrI.getOperand(1).getReg();
2947     AM.ScaledReg = AddrI.getOperand(2).getReg();
2948     AM.Scale = Scale;
2949     AM.Displacement = 0;
2950     AM.Form = Form;
2951     return true;
2952   };
2953 
2954   auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
2955     unsigned Opcode = MemI.getOpcode();
2956     return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
2957            Subtarget.isSTRQroSlow();
2958   };
2959 
2960   int64_t Disp = 0;
2961   const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
2962   switch (AddrI.getOpcode()) {
2963   default:
2964     return false;
2965 
2966   case AArch64::ADDXri:
2967     // add Xa, Xn, #N
2968     // ldr Xd, [Xa, #M]
2969     // ->
2970     // ldr Xd, [Xn, #N'+M]
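         // where N' is N shifted left by the ADDXri shift amount (0 or 12).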
2971     Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
2972     return canFoldAddSubImmIntoAddrMode(Disp);
2973 
2974   case AArch64::SUBXri:
2975     // sub Xa, Xn, #N
2976     // ldr Xd, [Xa, #M]
2977     // ->
2978     // ldr Xd, [Xn, #N'+M]
2979     Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
2980     return canFoldAddSubImmIntoAddrMode(-Disp);
2981 
2982   case AArch64::ADDXrs: {
2983     // add Xa, Xn, Xm, lsl #N
2984     // ldr Xd, [Xa]
2985     // ->
2986     // ldr Xd, [Xn, Xm, lsl #N]
2987 
2988     // Don't fold the add if the result would be slower, unless optimising for
2989     // size.
2990     unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm());
2991     if (AArch64_AM::getShiftType(Shift) != AArch64_AM::ShiftExtendType::LSL)
2992       return false;
2993     Shift = AArch64_AM::getShiftValue(Shift);
2994     if (!OptSize) {
2995       if ((Shift != 2 && Shift != 3) || !Subtarget.hasAddrLSLFast())
2996         return false;
2997       if (avoidSlowSTRQ(MemI))
2998         return false;
2999     }
3000     return canFoldAddRegIntoAddrMode(1ULL << Shift);
3001   }
3002 
3003   case AArch64::ADDXrr:
3004     // add Xa, Xn, Xm
3005     // ldr Xd, [Xa]
3006     // ->
3007     // ldr Xd, [Xn, Xm, lsl #0]
3008 
3009     // Don't fold the add if the result would be slower, unless optimising for
3010     // size.
3011     if (!OptSize && avoidSlowSTRQ(MemI))
3012       return false;
3013     return canFoldAddRegIntoAddrMode(1);
3014 
3015   case AArch64::ADDXrx:
3016     // add Xa, Xn, Wm, {s,u}xtw #N
3017     // ldr Xd, [Xa]
3018     // ->
3019     // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3020 
3021     // Don't fold the add if the result would be slower, unless optimising for
3022     // size.
3023     if (!OptSize && avoidSlowSTRQ(MemI))
3024       return false;
3025 
3026     // Can fold only sign-/zero-extend of a word.
3027     unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3028     AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm);
3029     if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3030       return false;
3031 
3032     return canFoldAddRegIntoAddrMode(
3033         1ULL << AArch64_AM::getArithShiftValue(Imm),
3034         (Extend == AArch64_AM::SXTW) ? ExtAddrMode::Formula::SExtScaledReg
3035                                      : ExtAddrMode::Formula::ZExtScaledReg);
3036   }
3037 }
3038 
3039 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3040 // return the opcode of an instruction performing the same operation, but using
3041 // the [Reg, Reg] addressing mode.
3042 static unsigned regOffsetOpcode(unsigned Opcode) {
3043   switch (Opcode) {
3044   default:
3045     llvm_unreachable("Address folding not implemented for instruction");
3046 
3047   case AArch64::LDURQi:
3048   case AArch64::LDRQui:
3049     return AArch64::LDRQroX;
3050   case AArch64::STURQi:
3051   case AArch64::STRQui:
3052     return AArch64::STRQroX;
3053   case AArch64::LDURDi:
3054   case AArch64::LDRDui:
3055     return AArch64::LDRDroX;
3056   case AArch64::STURDi:
3057   case AArch64::STRDui:
3058     return AArch64::STRDroX;
3059   case AArch64::LDURXi:
3060   case AArch64::LDRXui:
3061     return AArch64::LDRXroX;
3062   case AArch64::STURXi:
3063   case AArch64::STRXui:
3064     return AArch64::STRXroX;
3065   case AArch64::LDURWi:
3066   case AArch64::LDRWui:
3067     return AArch64::LDRWroX;
3068   case AArch64::LDURSWi:
3069   case AArch64::LDRSWui:
3070     return AArch64::LDRSWroX;
3071   case AArch64::STURWi:
3072   case AArch64::STRWui:
3073     return AArch64::STRWroX;
3074   case AArch64::LDURHi:
3075   case AArch64::LDRHui:
3076     return AArch64::LDRHroX;
3077   case AArch64::STURHi:
3078   case AArch64::STRHui:
3079     return AArch64::STRHroX;
3080   case AArch64::LDURHHi:
3081   case AArch64::LDRHHui:
3082     return AArch64::LDRHHroX;
3083   case AArch64::STURHHi:
3084   case AArch64::STRHHui:
3085     return AArch64::STRHHroX;
3086   case AArch64::LDURSHXi:
3087   case AArch64::LDRSHXui:
3088     return AArch64::LDRSHXroX;
3089   case AArch64::LDURSHWi:
3090   case AArch64::LDRSHWui:
3091     return AArch64::LDRSHWroX;
3092   case AArch64::LDURBi:
3093   case AArch64::LDRBui:
3094     return AArch64::LDRBroX;
3095   case AArch64::LDURBBi:
3096   case AArch64::LDRBBui:
3097     return AArch64::LDRBBroX;
3098   case AArch64::LDURSBXi:
3099   case AArch64::LDRSBXui:
3100     return AArch64::LDRSBXroX;
3101   case AArch64::LDURSBWi:
3102   case AArch64::LDRSBWui:
3103     return AArch64::LDRSBWroX;
3104   case AArch64::STURBi:
3105   case AArch64::STRBui:
3106     return AArch64::STRBroX;
3107   case AArch64::STURBBi:
3108   case AArch64::STRBBui:
3109     return AArch64::STRBBroX;
3110   }
3111 }
3112 
3113 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3114 // the opcode of an instruction performing the same operation, but using the
3115 // [Reg, #Imm] addressing mode with scaled offset.
3116 static unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3117   switch (Opcode) {
3118   default:
3119     llvm_unreachable("Address folding not implemented for instruction");
3120 
3121   case AArch64::LDURQi:
3122     Scale = 16;
3123     return AArch64::LDRQui;
3124   case AArch64::STURQi:
3125     Scale = 16;
3126     return AArch64::STRQui;
3127   case AArch64::LDURDi:
3128     Scale = 8;
3129     return AArch64::LDRDui;
3130   case AArch64::STURDi:
3131     Scale = 8;
3132     return AArch64::STRDui;
3133   case AArch64::LDURXi:
3134     Scale = 8;
3135     return AArch64::LDRXui;
3136   case AArch64::STURXi:
3137     Scale = 8;
3138     return AArch64::STRXui;
3139   case AArch64::LDURWi:
3140     Scale = 4;
3141     return AArch64::LDRWui;
3142   case AArch64::LDURSWi:
3143     Scale = 4;
3144     return AArch64::LDRSWui;
3145   case AArch64::STURWi:
3146     Scale = 4;
3147     return AArch64::STRWui;
3148   case AArch64::LDURHi:
3149     Scale = 2;
3150     return AArch64::LDRHui;
3151   case AArch64::STURHi:
3152     Scale = 2;
3153     return AArch64::STRHui;
3154   case AArch64::LDURHHi:
3155     Scale = 2;
3156     return AArch64::LDRHHui;
3157   case AArch64::STURHHi:
3158     Scale = 2;
3159     return AArch64::STRHHui;
3160   case AArch64::LDURSHXi:
3161     Scale = 2;
3162     return AArch64::LDRSHXui;
3163   case AArch64::LDURSHWi:
3164     Scale = 2;
3165     return AArch64::LDRSHWui;
3166   case AArch64::LDURBi:
3167     Scale = 1;
3168     return AArch64::LDRBui;
3169   case AArch64::LDURBBi:
3170     Scale = 1;
3171     return AArch64::LDRBBui;
3172   case AArch64::LDURSBXi:
3173     Scale = 1;
3174     return AArch64::LDRSBXui;
3175   case AArch64::LDURSBWi:
3176     Scale = 1;
3177     return AArch64::LDRSBWui;
3178   case AArch64::STURBi:
3179     Scale = 1;
3180     return AArch64::STRBui;
3181   case AArch64::STURBBi:
3182     Scale = 1;
3183     return AArch64::STRBBui;
3184   case AArch64::LDRQui:
3185   case AArch64::STRQui:
3186     Scale = 16;
3187     return Opcode;
3188   case AArch64::LDRDui:
3189   case AArch64::STRDui:
3190   case AArch64::LDRXui:
3191   case AArch64::STRXui:
3192     Scale = 8;
3193     return Opcode;
3194   case AArch64::LDRWui:
3195   case AArch64::LDRSWui:
3196   case AArch64::STRWui:
3197     Scale = 4;
3198     return Opcode;
3199   case AArch64::LDRHui:
3200   case AArch64::STRHui:
3201   case AArch64::LDRHHui:
3202   case AArch64::STRHHui:
3203   case AArch64::LDRSHXui:
3204   case AArch64::LDRSHWui:
3205     Scale = 2;
3206     return Opcode;
3207   case AArch64::LDRBui:
3208   case AArch64::LDRBBui:
3209   case AArch64::LDRSBXui:
3210   case AArch64::LDRSBWui:
3211   case AArch64::STRBui:
3212   case AArch64::STRBBui:
3213     Scale = 1;
3214     return Opcode;
3215   }
3216 }
3217 
3218 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3219 // the opcode of an instruction performing the same operation, but using the
3220 // [Reg, #Imm] addressing mode with unscaled offset.
3221 static unsigned unscaledOffsetOpcode(unsigned Opcode) {
3222   switch (Opcode) {
3223   default:
3224     llvm_unreachable("Address folding not implemented for instruction");
3225 
3226   case AArch64::LDURQi:
3227   case AArch64::STURQi:
3228   case AArch64::LDURDi:
3229   case AArch64::STURDi:
3230   case AArch64::LDURXi:
3231   case AArch64::STURXi:
3232   case AArch64::LDURWi:
3233   case AArch64::LDURSWi:
3234   case AArch64::STURWi:
3235   case AArch64::LDURHi:
3236   case AArch64::STURHi:
3237   case AArch64::LDURHHi:
3238   case AArch64::STURHHi:
3239   case AArch64::LDURSHXi:
3240   case AArch64::LDURSHWi:
3241   case AArch64::LDURBi:
3242   case AArch64::STURBi:
3243   case AArch64::LDURBBi:
3244   case AArch64::STURBBi:
3245   case AArch64::LDURSBWi:
3246   case AArch64::LDURSBXi:
3247     return Opcode;
3248   case AArch64::LDRQui:
3249     return AArch64::LDURQi;
3250   case AArch64::STRQui:
3251     return AArch64::STURQi;
3252   case AArch64::LDRDui:
3253     return AArch64::LDURDi;
3254   case AArch64::STRDui:
3255     return AArch64::STURDi;
3256   case AArch64::LDRXui:
3257     return AArch64::LDURXi;
3258   case AArch64::STRXui:
3259     return AArch64::STURXi;
3260   case AArch64::LDRWui:
3261     return AArch64::LDURWi;
3262   case AArch64::LDRSWui:
3263     return AArch64::LDURSWi;
3264   case AArch64::STRWui:
3265     return AArch64::STURWi;
3266   case AArch64::LDRHui:
3267     return AArch64::LDURHi;
3268   case AArch64::STRHui:
3269     return AArch64::STURHi;
3270   case AArch64::LDRHHui:
3271     return AArch64::LDURHHi;
3272   case AArch64::STRHHui:
3273     return AArch64::STURHHi;
3274   case AArch64::LDRSHXui:
3275     return AArch64::LDURSHXi;
3276   case AArch64::LDRSHWui:
3277     return AArch64::LDURSHWi;
3278   case AArch64::LDRBBui:
3279     return AArch64::LDURBBi;
3280   case AArch64::LDRBui:
3281     return AArch64::LDURBi;
3282   case AArch64::STRBBui:
3283     return AArch64::STURBBi;
3284   case AArch64::STRBui:
3285     return AArch64::STURBi;
3286   case AArch64::LDRSBWui:
3287     return AArch64::LDURSBWi;
3288   case AArch64::LDRSBXui:
3289     return AArch64::LDURSBXi;
3290   }
3291 }
3292 
3293 // Given the opcode of a memory load/store instruction, return the opcode of an
3294 // instruction performing the same operation, but using
3295 // the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
3296 // offset register.
3297 static unsigned offsetExtendOpcode(unsigned Opcode) {
3298   switch (Opcode) {
3299   default:
3300     llvm_unreachable("Address folding not implemented for instruction");
3301 
3302   case AArch64::LDRQroX:
3303   case AArch64::LDURQi:
3304   case AArch64::LDRQui:
3305     return AArch64::LDRQroW;
3306   case AArch64::STRQroX:
3307   case AArch64::STURQi:
3308   case AArch64::STRQui:
3309     return AArch64::STRQroW;
3310   case AArch64::LDRDroX:
3311   case AArch64::LDURDi:
3312   case AArch64::LDRDui:
3313     return AArch64::LDRDroW;
3314   case AArch64::STRDroX:
3315   case AArch64::STURDi:
3316   case AArch64::STRDui:
3317     return AArch64::STRDroW;
3318   case AArch64::LDRXroX:
3319   case AArch64::LDURXi:
3320   case AArch64::LDRXui:
3321     return AArch64::LDRXroW;
3322   case AArch64::STRXroX:
3323   case AArch64::STURXi:
3324   case AArch64::STRXui:
3325     return AArch64::STRXroW;
3326   case AArch64::LDRWroX:
3327   case AArch64::LDURWi:
3328   case AArch64::LDRWui:
3329     return AArch64::LDRWroW;
3330   case AArch64::LDRSWroX:
3331   case AArch64::LDURSWi:
3332   case AArch64::LDRSWui:
3333     return AArch64::LDRSWroW;
3334   case AArch64::STRWroX:
3335   case AArch64::STURWi:
3336   case AArch64::STRWui:
3337     return AArch64::STRWroW;
3338   case AArch64::LDRHroX:
3339   case AArch64::LDURHi:
3340   case AArch64::LDRHui:
3341     return AArch64::LDRHroW;
3342   case AArch64::STRHroX:
3343   case AArch64::STURHi:
3344   case AArch64::STRHui:
3345     return AArch64::STRHroW;
3346   case AArch64::LDRHHroX:
3347   case AArch64::LDURHHi:
3348   case AArch64::LDRHHui:
3349     return AArch64::LDRHHroW;
3350   case AArch64::STRHHroX:
3351   case AArch64::STURHHi:
3352   case AArch64::STRHHui:
3353     return AArch64::STRHHroW;
3354   case AArch64::LDRSHXroX:
3355   case AArch64::LDURSHXi:
3356   case AArch64::LDRSHXui:
3357     return AArch64::LDRSHXroW;
3358   case AArch64::LDRSHWroX:
3359   case AArch64::LDURSHWi:
3360   case AArch64::LDRSHWui:
3361     return AArch64::LDRSHWroW;
3362   case AArch64::LDRBroX:
3363   case AArch64::LDURBi:
3364   case AArch64::LDRBui:
3365     return AArch64::LDRBroW;
3366   case AArch64::LDRBBroX:
3367   case AArch64::LDURBBi:
3368   case AArch64::LDRBBui:
3369     return AArch64::LDRBBroW;
3370   case AArch64::LDRSBXroX:
3371   case AArch64::LDURSBXi:
3372   case AArch64::LDRSBXui:
3373     return AArch64::LDRSBXroW;
3374   case AArch64::LDRSBWroX:
3375   case AArch64::LDURSBWi:
3376   case AArch64::LDRSBWui:
3377     return AArch64::LDRSBWroW;
3378   case AArch64::STRBroX:
3379   case AArch64::STURBi:
3380   case AArch64::STRBui:
3381     return AArch64::STRBroW;
3382   case AArch64::STRBBroX:
3383   case AArch64::STURBBi:
3384   case AArch64::STRBBui:
3385     return AArch64::STRBBroW;
3386   }
3387 }
3388 
3389 MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI,
3390                                                  const ExtAddrMode &AM) const {
3391 
3392   const DebugLoc &DL = MemI.getDebugLoc();
3393   MachineBasicBlock &MBB = *MemI.getParent();
3394   MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();
3395 
3396   if (AM.Form == ExtAddrMode::Formula::Basic) {
3397     if (AM.ScaledReg) {
3398       // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
3399       unsigned Opcode = regOffsetOpcode(MemI.getOpcode());
3400       MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
3401       auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3402                    .addReg(MemI.getOperand(0).getReg(),
3403                            MemI.mayLoad() ? RegState::Define : 0)
3404                    .addReg(AM.BaseReg)
3405                    .addReg(AM.ScaledReg)
3406                    .addImm(0)
3407                    .addImm(AM.Scale > 1)
3408                    .setMemRefs(MemI.memoperands())
3409                    .setMIFlags(MemI.getFlags());
3410       return B.getInstr();
3411     }
3412 
3413     assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
3414            "Addressing mode not supported for folding");
3415 
3416     // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
3417     unsigned Scale = 1;
3418     unsigned Opcode = MemI.getOpcode();
3419     if (isInt<9>(AM.Displacement))
3420       Opcode = unscaledOffsetOpcode(Opcode);
3421     else
3422       Opcode = scaledOffsetOpcode(Opcode, Scale);
3423 
3424     auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3425                  .addReg(MemI.getOperand(0).getReg(),
3426                          MemI.mayLoad() ? RegState::Define : 0)
3427                  .addReg(AM.BaseReg)
3428                  .addImm(AM.Displacement / Scale)
3429                  .setMemRefs(MemI.memoperands())
3430                  .setMIFlags(MemI.getFlags());
3431     return B.getInstr();
3432   }
3433 
3434   if (AM.Form == ExtAddrMode::Formula::SExtScaledReg ||
3435       AM.Form == ExtAddrMode::Formula::ZExtScaledReg) {
3436     // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
3437     assert(AM.ScaledReg && !AM.Displacement &&
3438            "Address offset can be a register or an immediate, but not both");
3439     unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());
3440     MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
3441     // Make sure the offset register is in the correct register class.
3442     Register OffsetReg = AM.ScaledReg;
3443     const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg);
3444     if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {
3445       OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3446       BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)
3447           .addReg(AM.ScaledReg, 0, AArch64::sub_32);
3448     }
3449     auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3450                  .addReg(MemI.getOperand(0).getReg(),
3451                          MemI.mayLoad() ? RegState::Define : 0)
3452                  .addReg(AM.BaseReg)
3453                  .addReg(OffsetReg)
3454                  .addImm(AM.Form == ExtAddrMode::Formula::SExtScaledReg)
3455                  .addImm(AM.Scale != 1)
3456                  .setMemRefs(MemI.memoperands())
3457                  .setMIFlags(MemI.getFlags());
3458 
3459     return B.getInstr();
3460   }
3461 
3462   llvm_unreachable(
3463       "Function must not be called with an addressing mode it can't handle");
3464 }
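
// Illustrative sketch (an assumption of ours, not upstream documentation):
// given a MemI that loads x1 from [x8] via LDRXui and an addressing mode
//   AM = { BaseReg = x0, ScaledReg = x2, Scale = 8, Form = Basic },
// the Basic/ScaledReg path above picks LDRXroX through regOffsetOpcode() and
// emits the equivalent of
//   ldr x1, [x0, x2, lsl #3]
// where the trailing .addImm(AM.Scale > 1) operand requests the shift.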
3465 
3466 bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
3467     const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
3468     bool &OffsetIsScalable, TypeSize &Width,
3469     const TargetRegisterInfo *TRI) const {
3470   assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
3471   // Handle only loads/stores with base register followed by immediate offset.
3472   if (LdSt.getNumExplicitOperands() == 3) {
3473     // Non-paired instruction (e.g., ldr x1, [x0, #8]).
3474     if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
3475         !LdSt.getOperand(2).isImm())
3476       return false;
3477   } else if (LdSt.getNumExplicitOperands() == 4) {
3478     // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
3479     if (!LdSt.getOperand(1).isReg() ||
3480         (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
3481         !LdSt.getOperand(3).isImm())
3482       return false;
3483   } else
3484     return false;
3485 
3486   // Get the scaling factor for the instruction and set the width of the
3487   // memory access.
3488   TypeSize Scale(0U, false);
3489   int64_t Dummy1, Dummy2;
3490 
3491   // If this returns false, then it's an instruction we don't want to handle.
3492   if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
3493     return false;
3494 
3495   // Compute the offset. Offset is calculated as the immediate operand
3496   // multiplied by the scaling factor. Unscaled instructions have scaling factor
3497   // set to 1.
3498   if (LdSt.getNumExplicitOperands() == 3) {
3499     BaseOp = &LdSt.getOperand(1);
3500     Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue();
3501   } else {
3502     assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
3503     BaseOp = &LdSt.getOperand(2);
3504     Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue();
3505   }
3506   OffsetIsScalable = Scale.isScalable();
3507 
3508   if (!BaseOp->isReg() && !BaseOp->isFI())
3509     return false;
3510 
3511   return true;
3512 }
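
// For illustration: for a scaled load LDRXui with base x0 and immediate
// operand 1 (i.e. ldr x1, [x0, #8]), getMemOpInfo reports Scale == 8, so
// BaseOp points at the x0 operand and the returned byte Offset is 1 * 8 == 8
// with an 8-byte Width; unscaled forms such as LDURXi have Scale == 1 and
// return their immediate unchanged.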
3513 
3514 MachineOperand &
3515 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
3516   assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
3517   MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
3518   assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
3519   return OfsOp;
3520 }
3521 
3522 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
3523                                     TypeSize &Width, int64_t &MinOffset,
3524                                     int64_t &MaxOffset) {
3525   switch (Opcode) {
3526   // Not a memory operation or something we want to handle.
3527   default:
3528     Scale = TypeSize::getFixed(0);
3529     Width = TypeSize::getFixed(0);
3530     MinOffset = MaxOffset = 0;
3531     return false;
3532   case AArch64::STRWpost:
3533   case AArch64::LDRWpost:
3534     Width = TypeSize::getFixed(32);
3535     Scale = TypeSize::getFixed(4);
3536     MinOffset = -256;
3537     MaxOffset = 255;
3538     break;
3539   case AArch64::LDURQi:
3540   case AArch64::STURQi:
3541     Width = TypeSize::getFixed(16);
3542     Scale = TypeSize::getFixed(1);
3543     MinOffset = -256;
3544     MaxOffset = 255;
3545     break;
3546   case AArch64::PRFUMi:
3547   case AArch64::LDURXi:
3548   case AArch64::LDURDi:
3549   case AArch64::LDAPURXi:
3550   case AArch64::STURXi:
3551   case AArch64::STURDi:
3552   case AArch64::STLURXi:
3553     Width = TypeSize::getFixed(8);
3554     Scale = TypeSize::getFixed(1);
3555     MinOffset = -256;
3556     MaxOffset = 255;
3557     break;
3558   case AArch64::LDURWi:
3559   case AArch64::LDURSi:
3560   case AArch64::LDURSWi:
3561   case AArch64::LDAPURi:
3562   case AArch64::LDAPURSWi:
3563   case AArch64::STURWi:
3564   case AArch64::STURSi:
3565   case AArch64::STLURWi:
3566     Width = TypeSize::getFixed(4);
3567     Scale = TypeSize::getFixed(1);
3568     MinOffset = -256;
3569     MaxOffset = 255;
3570     break;
3571   case AArch64::LDURHi:
3572   case AArch64::LDURHHi:
3573   case AArch64::LDURSHXi:
3574   case AArch64::LDURSHWi:
3575   case AArch64::LDAPURHi:
3576   case AArch64::LDAPURSHWi:
3577   case AArch64::LDAPURSHXi:
3578   case AArch64::STURHi:
3579   case AArch64::STURHHi:
3580   case AArch64::STLURHi:
3581     Width = TypeSize::getFixed(2);
3582     Scale = TypeSize::getFixed(1);
3583     MinOffset = -256;
3584     MaxOffset = 255;
3585     break;
3586   case AArch64::LDURBi:
3587   case AArch64::LDURBBi:
3588   case AArch64::LDURSBXi:
3589   case AArch64::LDURSBWi:
3590   case AArch64::LDAPURBi:
3591   case AArch64::LDAPURSBWi:
3592   case AArch64::LDAPURSBXi:
3593   case AArch64::STURBi:
3594   case AArch64::STURBBi:
3595   case AArch64::STLURBi:
3596     Width = TypeSize::getFixed(1);
3597     Scale = TypeSize::getFixed(1);
3598     MinOffset = -256;
3599     MaxOffset = 255;
3600     break;
3601   case AArch64::LDPQi:
3602   case AArch64::LDNPQi:
3603   case AArch64::STPQi:
3604   case AArch64::STNPQi:
3605     Scale = TypeSize::getFixed(16);
3606     Width = TypeSize::getFixed(32);
3607     MinOffset = -64;
3608     MaxOffset = 63;
3609     break;
3610   case AArch64::LDRQui:
3611   case AArch64::STRQui:
3612     Scale = TypeSize::getFixed(16);
3613     Width = TypeSize::getFixed(16);
3614     MinOffset = 0;
3615     MaxOffset = 4095;
3616     break;
3617   case AArch64::LDPXi:
3618   case AArch64::LDPDi:
3619   case AArch64::LDNPXi:
3620   case AArch64::LDNPDi:
3621   case AArch64::STPXi:
3622   case AArch64::STPDi:
3623   case AArch64::STNPXi:
3624   case AArch64::STNPDi:
3625     Scale = TypeSize::getFixed(8);
3626     Width = TypeSize::getFixed(16);
3627     MinOffset = -64;
3628     MaxOffset = 63;
3629     break;
3630   case AArch64::PRFMui:
3631   case AArch64::LDRXui:
3632   case AArch64::LDRDui:
3633   case AArch64::STRXui:
3634   case AArch64::STRDui:
3635     Scale = TypeSize::getFixed(8);
3636     Width = TypeSize::getFixed(8);
3637     MinOffset = 0;
3638     MaxOffset = 4095;
3639     break;
3640   case AArch64::StoreSwiftAsyncContext:
3641     // Store is an STRXui, but there might be an ADDXri in the expansion too.
3642     Scale = TypeSize::getFixed(1);
3643     Width = TypeSize::getFixed(8);
3644     MinOffset = 0;
3645     MaxOffset = 4095;
3646     break;
3647   case AArch64::LDPWi:
3648   case AArch64::LDPSi:
3649   case AArch64::LDNPWi:
3650   case AArch64::LDNPSi:
3651   case AArch64::STPWi:
3652   case AArch64::STPSi:
3653   case AArch64::STNPWi:
3654   case AArch64::STNPSi:
3655     Scale = TypeSize::getFixed(4);
3656     Width = TypeSize::getFixed(8);
3657     MinOffset = -64;
3658     MaxOffset = 63;
3659     break;
3660   case AArch64::LDRWui:
3661   case AArch64::LDRSui:
3662   case AArch64::LDRSWui:
3663   case AArch64::STRWui:
3664   case AArch64::STRSui:
3665     Scale = TypeSize::getFixed(4);
3666     Width = TypeSize::getFixed(4);
3667     MinOffset = 0;
3668     MaxOffset = 4095;
3669     break;
3670   case AArch64::LDRHui:
3671   case AArch64::LDRHHui:
3672   case AArch64::LDRSHWui:
3673   case AArch64::LDRSHXui:
3674   case AArch64::STRHui:
3675   case AArch64::STRHHui:
3676     Scale = TypeSize::getFixed(2);
3677     Width = TypeSize::getFixed(2);
3678     MinOffset = 0;
3679     MaxOffset = 4095;
3680     break;
3681   case AArch64::LDRBui:
3682   case AArch64::LDRBBui:
3683   case AArch64::LDRSBWui:
3684   case AArch64::LDRSBXui:
3685   case AArch64::STRBui:
3686   case AArch64::STRBBui:
3687     Scale = TypeSize::getFixed(1);
3688     Width = TypeSize::getFixed(1);
3689     MinOffset = 0;
3690     MaxOffset = 4095;
3691     break;
3692   case AArch64::STPXpre:
3693   case AArch64::LDPXpost:
3694   case AArch64::STPDpre:
3695   case AArch64::LDPDpost:
3696     Scale = TypeSize::getFixed(8);
3697     Width = TypeSize::getFixed(8);
3698     MinOffset = -512;
3699     MaxOffset = 504;
3700     break;
3701   case AArch64::STPQpre:
3702   case AArch64::LDPQpost:
3703     Scale = TypeSize::getFixed(16);
3704     Width = TypeSize::getFixed(16);
3705     MinOffset = -1024;
3706     MaxOffset = 1008;
3707     break;
3708   case AArch64::STRXpre:
3709   case AArch64::STRDpre:
3710   case AArch64::LDRXpost:
3711   case AArch64::LDRDpost:
3712     Scale = TypeSize::getFixed(1);
3713     Width = TypeSize::getFixed(8);
3714     MinOffset = -256;
3715     MaxOffset = 255;
3716     break;
3717   case AArch64::STRQpre:
3718   case AArch64::LDRQpost:
3719     Scale = TypeSize::getFixed(1);
3720     Width = TypeSize::getFixed(16);
3721     MinOffset = -256;
3722     MaxOffset = 255;
3723     break;
3724   case AArch64::ADDG:
3725     Scale = TypeSize::getFixed(16);
3726     Width = TypeSize::getFixed(0);
3727     MinOffset = 0;
3728     MaxOffset = 63;
3729     break;
3730   case AArch64::TAGPstack:
3731     Scale = TypeSize::getFixed(16);
3732     Width = TypeSize::getFixed(0);
3733     // TAGP with a negative offset turns into SUBP, which has a maximum offset
3734     // of 63 (not 64!).
3735     MinOffset = -63;
3736     MaxOffset = 63;
3737     break;
3738   case AArch64::LDG:
3739   case AArch64::STGi:
3740   case AArch64::STZGi:
3741     Scale = TypeSize::getFixed(16);
3742     Width = TypeSize::getFixed(16);
3743     MinOffset = -256;
3744     MaxOffset = 255;
3745     break;
3746   case AArch64::STR_ZZZZXI:
3747   case AArch64::LDR_ZZZZXI:
3748     Scale = TypeSize::getScalable(16);
3749     Width = TypeSize::getScalable(16 * 4);
3750     MinOffset = -256;
3751     MaxOffset = 252;
3752     break;
3753   case AArch64::STR_ZZZXI:
3754   case AArch64::LDR_ZZZXI:
3755     Scale = TypeSize::getScalable(16);
3756     Width = TypeSize::getScalable(16 * 3);
3757     MinOffset = -256;
3758     MaxOffset = 253;
3759     break;
3760   case AArch64::STR_ZZXI:
3761   case AArch64::LDR_ZZXI:
3762     Scale = TypeSize::getScalable(16);
3763     Width = TypeSize::getScalable(16 * 2);
3764     MinOffset = -256;
3765     MaxOffset = 254;
3766     break;
3767   case AArch64::LDR_PXI:
3768   case AArch64::STR_PXI:
3769     Scale = TypeSize::getScalable(2);
3770     Width = TypeSize::getScalable(2);
3771     MinOffset = -256;
3772     MaxOffset = 255;
3773     break;
3774   case AArch64::LDR_PPXI:
3775   case AArch64::STR_PPXI:
3776     Scale = TypeSize::getScalable(2);
3777     Width = TypeSize::getScalable(2 * 2);
3778     MinOffset = -256;
3779     MaxOffset = 254;
3780     break;
3781   case AArch64::LDR_ZXI:
3782   case AArch64::STR_ZXI:
3783     Scale = TypeSize::getScalable(16);
3784     Width = TypeSize::getScalable(16);
3785     MinOffset = -256;
3786     MaxOffset = 255;
3787     break;
3788   case AArch64::LD1B_IMM:
3789   case AArch64::LD1H_IMM:
3790   case AArch64::LD1W_IMM:
3791   case AArch64::LD1D_IMM:
3792   case AArch64::LDNT1B_ZRI:
3793   case AArch64::LDNT1H_ZRI:
3794   case AArch64::LDNT1W_ZRI:
3795   case AArch64::LDNT1D_ZRI:
3796   case AArch64::ST1B_IMM:
3797   case AArch64::ST1H_IMM:
3798   case AArch64::ST1W_IMM:
3799   case AArch64::ST1D_IMM:
3800   case AArch64::STNT1B_ZRI:
3801   case AArch64::STNT1H_ZRI:
3802   case AArch64::STNT1W_ZRI:
3803   case AArch64::STNT1D_ZRI:
3804   case AArch64::LDNF1B_IMM:
3805   case AArch64::LDNF1H_IMM:
3806   case AArch64::LDNF1W_IMM:
3807   case AArch64::LDNF1D_IMM:
3808     // A full vector's worth of data
3809     // Width = mbytes * elements
3810     Scale = TypeSize::getScalable(16);
3811     Width = TypeSize::getScalable(16);
3812     MinOffset = -8;
3813     MaxOffset = 7;
3814     break;
3815   case AArch64::LD2B_IMM:
3816   case AArch64::LD2H_IMM:
3817   case AArch64::LD2W_IMM:
3818   case AArch64::LD2D_IMM:
3819   case AArch64::ST2B_IMM:
3820   case AArch64::ST2H_IMM:
3821   case AArch64::ST2W_IMM:
3822   case AArch64::ST2D_IMM:
3823     Scale = TypeSize::getScalable(32);
3824     Width = TypeSize::getScalable(16 * 2);
3825     MinOffset = -8;
3826     MaxOffset = 7;
3827     break;
3828   case AArch64::LD3B_IMM:
3829   case AArch64::LD3H_IMM:
3830   case AArch64::LD3W_IMM:
3831   case AArch64::LD3D_IMM:
3832   case AArch64::ST3B_IMM:
3833   case AArch64::ST3H_IMM:
3834   case AArch64::ST3W_IMM:
3835   case AArch64::ST3D_IMM:
3836     Scale = TypeSize::getScalable(48);
3837     Width = TypeSize::getScalable(16 * 3);
3838     MinOffset = -8;
3839     MaxOffset = 7;
3840     break;
3841   case AArch64::LD4B_IMM:
3842   case AArch64::LD4H_IMM:
3843   case AArch64::LD4W_IMM:
3844   case AArch64::LD4D_IMM:
3845   case AArch64::ST4B_IMM:
3846   case AArch64::ST4H_IMM:
3847   case AArch64::ST4W_IMM:
3848   case AArch64::ST4D_IMM:
3849     Scale = TypeSize::getScalable(64);
3850     Width = TypeSize::getScalable(16 * 4);
3851     MinOffset = -8;
3852     MaxOffset = 7;
3853     break;
3854   case AArch64::LD1B_H_IMM:
3855   case AArch64::LD1SB_H_IMM:
3856   case AArch64::LD1H_S_IMM:
3857   case AArch64::LD1SH_S_IMM:
3858   case AArch64::LD1W_D_IMM:
3859   case AArch64::LD1SW_D_IMM:
3860   case AArch64::ST1B_H_IMM:
3861   case AArch64::ST1H_S_IMM:
3862   case AArch64::ST1W_D_IMM:
3863   case AArch64::LDNF1B_H_IMM:
3864   case AArch64::LDNF1SB_H_IMM:
3865   case AArch64::LDNF1H_S_IMM:
3866   case AArch64::LDNF1SH_S_IMM:
3867   case AArch64::LDNF1W_D_IMM:
3868   case AArch64::LDNF1SW_D_IMM:
3869     // A half vector's worth of data
3870     // Width = mbytes * elements
3871     Scale = TypeSize::getScalable(8);
3872     Width = TypeSize::getScalable(8);
3873     MinOffset = -8;
3874     MaxOffset = 7;
3875     break;
3876   case AArch64::LD1B_S_IMM:
3877   case AArch64::LD1SB_S_IMM:
3878   case AArch64::LD1H_D_IMM:
3879   case AArch64::LD1SH_D_IMM:
3880   case AArch64::ST1B_S_IMM:
3881   case AArch64::ST1H_D_IMM:
3882   case AArch64::LDNF1B_S_IMM:
3883   case AArch64::LDNF1SB_S_IMM:
3884   case AArch64::LDNF1H_D_IMM:
3885   case AArch64::LDNF1SH_D_IMM:
3886     // A quarter vector's worth of data
3887     // Width = mbytes * elements
3888     Scale = TypeSize::getScalable(4);
3889     Width = TypeSize::getScalable(4);
3890     MinOffset = -8;
3891     MaxOffset = 7;
3892     break;
3893   case AArch64::LD1B_D_IMM:
3894   case AArch64::LD1SB_D_IMM:
3895   case AArch64::ST1B_D_IMM:
3896   case AArch64::LDNF1B_D_IMM:
3897   case AArch64::LDNF1SB_D_IMM:
3898     // An eighth vector's worth of data
3899     // Width = mbytes * elements
3900     Scale = TypeSize::getScalable(2);
3901     Width = TypeSize::getScalable(2);
3902     MinOffset = -8;
3903     MaxOffset = 7;
3904     break;
3905   case AArch64::ST2Gi:
3906   case AArch64::STZ2Gi:
3907     Scale = TypeSize::getFixed(16);
3908     Width = TypeSize::getFixed(32);
3909     MinOffset = -256;
3910     MaxOffset = 255;
3911     break;
3912   case AArch64::STGPi:
3913     Scale = TypeSize::getFixed(16);
3914     Width = TypeSize::getFixed(16);
3915     MinOffset = -64;
3916     MaxOffset = 63;
3917     break;
3918   case AArch64::LD1RB_IMM:
3919   case AArch64::LD1RB_H_IMM:
3920   case AArch64::LD1RB_S_IMM:
3921   case AArch64::LD1RB_D_IMM:
3922   case AArch64::LD1RSB_H_IMM:
3923   case AArch64::LD1RSB_S_IMM:
3924   case AArch64::LD1RSB_D_IMM:
3925     Scale = TypeSize::getFixed(1);
3926     Width = TypeSize::getFixed(1);
3927     MinOffset = 0;
3928     MaxOffset = 63;
3929     break;
3930   case AArch64::LD1RH_IMM:
3931   case AArch64::LD1RH_S_IMM:
3932   case AArch64::LD1RH_D_IMM:
3933   case AArch64::LD1RSH_S_IMM:
3934   case AArch64::LD1RSH_D_IMM:
3935     Scale = TypeSize::getFixed(2);
3936     Width = TypeSize::getFixed(2);
3937     MinOffset = 0;
3938     MaxOffset = 63;
3939     break;
3940   case AArch64::LD1RW_IMM:
3941   case AArch64::LD1RW_D_IMM:
3942   case AArch64::LD1RSW_IMM:
3943     Scale = TypeSize::getFixed(4);
3944     Width = TypeSize::getFixed(4);
3945     MinOffset = 0;
3946     MaxOffset = 63;
3947     break;
3948   case AArch64::LD1RD_IMM:
3949     Scale = TypeSize::getFixed(8);
3950     Width = TypeSize::getFixed(8);
3951     MinOffset = 0;
3952     MaxOffset = 63;
3953     break;
3954   }
3955 
3956   return true;
3957 }
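
// Usage sketch (illustration only; the local variable names are ours):
//
//   TypeSize Scale = TypeSize::getFixed(0), Width = TypeSize::getFixed(0);
//   int64_t MinOff, MaxOff;
//   if (AArch64InstrInfo::getMemOpInfo(AArch64::STRXui, Scale, Width, MinOff,
//                                      MaxOff)) {
//     // STRXui: Scale == 8, Width == 8, MinOff == 0, MaxOff == 4095, where
//     // MinOff/MaxOff are expressed in units of Scale bytes.
//   }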
3958 
3959 // Scaling factor for unscaled load or store.
3960 int AArch64InstrInfo::getMemScale(unsigned Opc) {
3961   switch (Opc) {
3962   default:
3963     llvm_unreachable("Opcode has unknown scale!");
3964   case AArch64::LDRBBui:
3965   case AArch64::LDURBBi:
3966   case AArch64::LDRSBWui:
3967   case AArch64::LDURSBWi:
3968   case AArch64::STRBBui:
3969   case AArch64::STURBBi:
3970     return 1;
3971   case AArch64::LDRHHui:
3972   case AArch64::LDURHHi:
3973   case AArch64::LDRSHWui:
3974   case AArch64::LDURSHWi:
3975   case AArch64::STRHHui:
3976   case AArch64::STURHHi:
3977     return 2;
3978   case AArch64::LDRSui:
3979   case AArch64::LDURSi:
3980   case AArch64::LDRSpre:
3981   case AArch64::LDRSWui:
3982   case AArch64::LDURSWi:
3983   case AArch64::LDRSWpre:
3984   case AArch64::LDRWpre:
3985   case AArch64::LDRWui:
3986   case AArch64::LDURWi:
3987   case AArch64::STRSui:
3988   case AArch64::STURSi:
3989   case AArch64::STRSpre:
3990   case AArch64::STRWui:
3991   case AArch64::STURWi:
3992   case AArch64::STRWpre:
3993   case AArch64::LDPSi:
3994   case AArch64::LDPSWi:
3995   case AArch64::LDPWi:
3996   case AArch64::STPSi:
3997   case AArch64::STPWi:
3998     return 4;
3999   case AArch64::LDRDui:
4000   case AArch64::LDURDi:
4001   case AArch64::LDRDpre:
4002   case AArch64::LDRXui:
4003   case AArch64::LDURXi:
4004   case AArch64::LDRXpre:
4005   case AArch64::STRDui:
4006   case AArch64::STURDi:
4007   case AArch64::STRDpre:
4008   case AArch64::STRXui:
4009   case AArch64::STURXi:
4010   case AArch64::STRXpre:
4011   case AArch64::LDPDi:
4012   case AArch64::LDPXi:
4013   case AArch64::STPDi:
4014   case AArch64::STPXi:
4015     return 8;
4016   case AArch64::LDRQui:
4017   case AArch64::LDURQi:
4018   case AArch64::STRQui:
4019   case AArch64::STURQi:
4020   case AArch64::STRQpre:
4021   case AArch64::LDPQi:
4022   case AArch64::LDRQpre:
4023   case AArch64::STPQi:
4024   case AArch64::STGi:
4025   case AArch64::STZGi:
4026   case AArch64::ST2Gi:
4027   case AArch64::STZ2Gi:
4028   case AArch64::STGPi:
4029     return 16;
4030   }
4031 }
4032 
4033 bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
4034   switch (MI.getOpcode()) {
4035   default:
4036     return false;
4037   case AArch64::LDRWpre:
4038   case AArch64::LDRXpre:
4039   case AArch64::LDRSWpre:
4040   case AArch64::LDRSpre:
4041   case AArch64::LDRDpre:
4042   case AArch64::LDRQpre:
4043     return true;
4044   }
4045 }
4046 
4047 bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
4048   switch (MI.getOpcode()) {
4049   default:
4050     return false;
4051   case AArch64::STRWpre:
4052   case AArch64::STRXpre:
4053   case AArch64::STRSpre:
4054   case AArch64::STRDpre:
4055   case AArch64::STRQpre:
4056     return true;
4057   }
4058 }
4059 
4060 bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
4061   return isPreLd(MI) || isPreSt(MI);
4062 }
4063 
4064 bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
4065   switch (MI.getOpcode()) {
4066   default:
4067     return false;
4068   case AArch64::LDPSi:
4069   case AArch64::LDPSWi:
4070   case AArch64::LDPDi:
4071   case AArch64::LDPQi:
4072   case AArch64::LDPWi:
4073   case AArch64::LDPXi:
4074   case AArch64::STPSi:
4075   case AArch64::STPDi:
4076   case AArch64::STPQi:
4077   case AArch64::STPWi:
4078   case AArch64::STPXi:
4079   case AArch64::STGPi:
4080     return true;
4081   }
4082 }
4083 
4084 const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
4085   unsigned Idx =
4086       AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
4087                                                                             : 1;
4088   return MI.getOperand(Idx);
4089 }
4090 
4091 const MachineOperand &
4092 AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
4093   unsigned Idx =
4094       AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
4095                                                                             : 2;
4096   return MI.getOperand(Idx);
4097 }
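
// For example, in a paired store STPXi of x0 and x1 to [sp, #16] (operands
// x0, x1, sp, 2), getLdStBaseOp returns operand 2 (sp) and getLdStOffsetOp
// returns operand 3 (the scaled immediate 2); a non-paired LDRXui instead
// uses operands 1 and 2 respectively.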
4098 
4099 const MachineOperand &
4100 AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
4101   switch (MI.getOpcode()) {
4102   default:
4103     llvm_unreachable("Unexpected opcode");
4104   case AArch64::LDRBBroX:
4105     return MI.getOperand(4);
4106   }
4107 }
4108 
4109 static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
4110                                               Register Reg) {
4111   if (MI.getParent() == nullptr)
4112     return nullptr;
4113   const MachineFunction *MF = MI.getParent()->getParent();
4114   return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
4115 }
4116 
4117 bool AArch64InstrInfo::isHForm(const MachineInstr &MI) {
4118   auto IsHFPR = [&](const MachineOperand &Op) {
4119     if (!Op.isReg())
4120       return false;
4121     auto Reg = Op.getReg();
4122     if (Reg.isPhysical())
4123       return AArch64::FPR16RegClass.contains(Reg);
4124     const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4125     return TRC == &AArch64::FPR16RegClass ||
4126            TRC == &AArch64::FPR16_loRegClass;
4127   };
4128   return llvm::any_of(MI.operands(), IsHFPR);
4129 }
4130 
4131 bool AArch64InstrInfo::isQForm(const MachineInstr &MI) {
4132   auto IsQFPR = [&](const MachineOperand &Op) {
4133     if (!Op.isReg())
4134       return false;
4135     auto Reg = Op.getReg();
4136     if (Reg.isPhysical())
4137       return AArch64::FPR128RegClass.contains(Reg);
4138     const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4139     return TRC == &AArch64::FPR128RegClass ||
4140            TRC == &AArch64::FPR128_loRegClass;
4141   };
4142   return llvm::any_of(MI.operands(), IsQFPR);
4143 }
4144 
4145 bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) {
4146   switch (MI.getOpcode()) {
4147   case AArch64::BRK:
4148   case AArch64::HLT:
4149   case AArch64::PACIASP:
4150   case AArch64::PACIBSP:
4151     // Implicit BTI behavior.
4152     return true;
4153   case AArch64::PAUTH_PROLOGUE:
4154     // PAUTH_PROLOGUE expands to PACI(A|B)SP.
4155     return true;
4156   case AArch64::HINT: {
4157     unsigned Imm = MI.getOperand(0).getImm();
4158     // Explicit BTI instruction.
4159     if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
4160       return true;
4161     // PACI(A|B)SP instructions.
4162     if (Imm == 25 || Imm == 27)
4163       return true;
4164     return false;
4165   }
4166   default:
4167     return false;
4168   }
4169 }
4170 
4171 bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) {
4172   auto IsFPR = [&](const MachineOperand &Op) {
4173     if (!Op.isReg())
4174       return false;
4175     auto Reg = Op.getReg();
4176     if (Reg.isPhysical())
4177       return AArch64::FPR128RegClass.contains(Reg) ||
4178              AArch64::FPR64RegClass.contains(Reg) ||
4179              AArch64::FPR32RegClass.contains(Reg) ||
4180              AArch64::FPR16RegClass.contains(Reg) ||
4181              AArch64::FPR8RegClass.contains(Reg);
4182 
4183     const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4184     return TRC == &AArch64::FPR128RegClass ||
4185            TRC == &AArch64::FPR128_loRegClass ||
4186            TRC == &AArch64::FPR64RegClass ||
4187            TRC == &AArch64::FPR64_loRegClass ||
4188            TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
4189            TRC == &AArch64::FPR8RegClass;
4190   };
4191   return llvm::any_of(MI.operands(), IsFPR);
4192 }
4193 
4194 // Scale the unscaled offsets.  Returns false if the unscaled offset can't be
4195 // scaled.
4196 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
4197   int Scale = AArch64InstrInfo::getMemScale(Opc);
4198 
4199   // If the byte-offset isn't a multiple of the stride, we can't scale this
4200   // offset.
4201   if (Offset % Scale != 0)
4202     return false;
4203 
4204   // Convert the byte-offset used by unscaled into an "element" offset used
4205   // by the scaled pair load/store instructions.
4206   Offset /= Scale;
4207   return true;
4208 }
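
// For example, an unscaled LDURXi (stride 8) with a byte offset of 16 scales
// to an element offset of 2, whereas a byte offset of 12 is rejected because
// 12 % 8 != 0 and cannot be expressed in the scaled pair form.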
4209 
4210 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
4211   if (FirstOpc == SecondOpc)
4212     return true;
4213   // We can also pair sign-ext and zero-ext instructions.
4214   switch (FirstOpc) {
4215   default:
4216     return false;
4217   case AArch64::LDRWui:
4218   case AArch64::LDURWi:
4219     return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
4220   case AArch64::LDRSWui:
4221   case AArch64::LDURSWi:
4222     return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
4223   }
4224   // These instructions can't be paired based on their opcodes.
4225   return false;
4226 }
4227 
4228 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
4229                             int64_t Offset1, unsigned Opcode1, int FI2,
4230                             int64_t Offset2, unsigned Opcode2) {
4231   // Accesses through fixed stack object frame indices may access a different
4232   // fixed stack slot. Check that the object offsets + offsets match.
4233   if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
4234     int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
4235     int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
4236     assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
4237     // Convert to scaled object offsets.
4238     int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
4239     if (ObjectOffset1 % Scale1 != 0)
4240       return false;
4241     ObjectOffset1 /= Scale1;
4242     int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
4243     if (ObjectOffset2 % Scale2 != 0)
4244       return false;
4245     ObjectOffset2 /= Scale2;
4246     ObjectOffset1 += Offset1;
4247     ObjectOffset2 += Offset2;
4248     return ObjectOffset1 + 1 == ObjectOffset2;
4249   }
4250 
4251   return FI1 == FI2;
4252 }
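
// Worked example (illustrative): two LDRXui accesses through fixed stack
// objects at byte offsets 0 and 8, each with an instruction offset of 0,
// give scaled object offsets of 0 and 1; since 0 + 0 + 1 == 1 + 0, the
// accesses are adjacent and may be clustered.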
4253 
4254 /// Detect opportunities for ldp/stp formation.
4255 ///
4256 /// Only called for LdSt for which getMemOperandWithOffset returns true.
4257 bool AArch64InstrInfo::shouldClusterMemOps(
4258     ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
4259     bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
4260     int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
4261     unsigned NumBytes) const {
4262   assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
4263   const MachineOperand &BaseOp1 = *BaseOps1.front();
4264   const MachineOperand &BaseOp2 = *BaseOps2.front();
4265   const MachineInstr &FirstLdSt = *BaseOp1.getParent();
4266   const MachineInstr &SecondLdSt = *BaseOp2.getParent();
4267   if (BaseOp1.getType() != BaseOp2.getType())
4268     return false;
4269 
4270   assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
4271          "Only base registers and frame indices are supported.");
4272 
4273   // Check for both base regs and base FI.
4274   if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
4275     return false;
4276 
4277   // Only cluster up to a single pair.
4278   if (ClusterSize > 2)
4279     return false;
4280 
4281   if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
4282     return false;
4283 
4284   // Can we pair these instructions based on their opcodes?
4285   unsigned FirstOpc = FirstLdSt.getOpcode();
4286   unsigned SecondOpc = SecondLdSt.getOpcode();
4287   if (!canPairLdStOpc(FirstOpc, SecondOpc))
4288     return false;
4289 
4290   // Can't merge volatiles or load/stores that have a hint to avoid pair
4291   // formation, for example.
4292   if (!isCandidateToMergeOrPair(FirstLdSt) ||
4293       !isCandidateToMergeOrPair(SecondLdSt))
4294     return false;
4295 
4296   // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
4297   int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
4298   if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
4299     return false;
4300 
4301   int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
4302   if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
4303     return false;
4304 
4305   // Pairwise instructions have a 7-bit signed offset field.
4306   if (Offset1 > 63 || Offset1 < -64)
4307     return false;
4308 
4309   // The caller should already have ordered First/SecondLdSt by offset.
4310   // Note: this does not hold when the bases are non-equal frame indices.
4311   if (BaseOp1.isFI()) {
4312     assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
4313            "Caller should have ordered offsets.");
4314 
4315     const MachineFrameInfo &MFI =
4316         FirstLdSt.getParent()->getParent()->getFrameInfo();
4317     return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
4318                            BaseOp2.getIndex(), Offset2, SecondOpc);
4319   }
4320 
4321   assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
4322 
4323   return Offset1 + 1 == Offset2;
4324 }
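
// For instance (hedged example): two LDRXui loads from the same base register
// with immediates 2 and 3 (byte offsets 16 and 24) satisfy
// Offset1 + 1 == Offset2, so the scheduler may keep them adjacent for later
// ldp formation.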
4325 
4326 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
4327                                             unsigned Reg, unsigned SubIdx,
4328                                             unsigned State,
4329                                             const TargetRegisterInfo *TRI) {
4330   if (!SubIdx)
4331     return MIB.addReg(Reg, State);
4332 
4333   if (Register::isPhysicalRegister(Reg))
4334     return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
4335   return MIB.addReg(Reg, State, SubIdx);
4336 }
4337 
4338 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
4339                                         unsigned NumRegs) {
4340   // We really want the positive remainder mod 32 here; that happens to be
4341   // easily obtainable with a mask.
4342   return ((DestReg - SrcReg) & 0x1f) < NumRegs;
4343 }
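
// For example, copying a 3-register D-tuple whose destination starts at d2
// from a source starting at d1 gives ((2 - 1) & 0x1f) == 1 < 3: a forward
// sub-register copy would overwrite d2/d3 before reading them, so the caller
// copies in reverse order. Copying d1..d3 from d2..d4 yields 31 instead and
// the forward order is safe.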
4344 
4345 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
4346                                         MachineBasicBlock::iterator I,
4347                                         const DebugLoc &DL, MCRegister DestReg,
4348                                         MCRegister SrcReg, bool KillSrc,
4349                                         unsigned Opcode,
4350                                         ArrayRef<unsigned> Indices) const {
4351   assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
4352   const TargetRegisterInfo *TRI = &getRegisterInfo();
4353   uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
4354   uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
4355   unsigned NumRegs = Indices.size();
4356 
4357   int SubReg = 0, End = NumRegs, Incr = 1;
4358   if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
4359     SubReg = NumRegs - 1;
4360     End = -1;
4361     Incr = -1;
4362   }
4363 
4364   for (; SubReg != End; SubReg += Incr) {
4365     const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
4366     AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
4367     AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
4368     AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
4369   }
4370 }
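
// Illustrative expansion (ours, not upstream documentation): a QQ pair copy
// from q3_q4 into q1_q2 with Opcode == ORRv16i8 and Indices == {qsub0, qsub1}
// becomes
//   orr v1.16b, v3.16b, v3.16b
//   orr v2.16b, v4.16b, v4.16b
// with the order reversed whenever the source and destination tuples overlap
// as detected by forwardCopyWillClobberTuple.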
4371 
4372 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
4373                                        MachineBasicBlock::iterator I,
4374                                        DebugLoc DL, unsigned DestReg,
4375                                        unsigned SrcReg, bool KillSrc,
4376                                        unsigned Opcode, unsigned ZeroReg,
4377                                        llvm::ArrayRef<unsigned> Indices) const {
4378   const TargetRegisterInfo *TRI = &getRegisterInfo();
4379   unsigned NumRegs = Indices.size();
4380 
4381 #ifndef NDEBUG
4382   uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
4383   uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
4384   assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
4385          "GPR reg sequences should not be able to overlap");
4386 #endif
4387 
4388   for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
4389     const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
4390     AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
4391     MIB.addReg(ZeroReg);
4392     AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
4393     MIB.addImm(0);
4394   }
4395 }
4396 
4397 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
4398                                    MachineBasicBlock::iterator I,
4399                                    const DebugLoc &DL, MCRegister DestReg,
4400                                    MCRegister SrcReg, bool KillSrc) const {
4401   if (AArch64::GPR32spRegClass.contains(DestReg) &&
4402       (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
4403     const TargetRegisterInfo *TRI = &getRegisterInfo();
4404 
4405     if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
4406       // If either operand is WSP, expand to ADD #0.
4407       if (Subtarget.hasZeroCycleRegMove()) {
4408         // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
4409         MCRegister DestRegX = TRI->getMatchingSuperReg(
4410             DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4411         MCRegister SrcRegX = TRI->getMatchingSuperReg(
4412             SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4413         // This instruction is reading and writing X registers.  This may upset
4414         // the register scavenger and machine verifier, so we need to indicate
4415         // that we are reading an undefined value from SrcRegX, but a proper
4416         // value from SrcReg.
4417         BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
4418             .addReg(SrcRegX, RegState::Undef)
4419             .addImm(0)
4420             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
4421             .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
4422       } else {
4423         BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
4424             .addReg(SrcReg, getKillRegState(KillSrc))
4425             .addImm(0)
4426             .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4427       }
4428     } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
4429       BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
4430           .addImm(0)
4431           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4432     } else {
4433       if (Subtarget.hasZeroCycleRegMove()) {
4434         // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
4435         MCRegister DestRegX = TRI->getMatchingSuperReg(
4436             DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4437         MCRegister SrcRegX = TRI->getMatchingSuperReg(
4438             SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4439         // This instruction is reading and writing X registers.  This may upset
4440         // the register scavenger and machine verifier, so we need to indicate
4441         // that we are reading an undefined value from SrcRegX, but a proper
4442         // value from SrcReg.
4443         BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
4444             .addReg(AArch64::XZR)
4445             .addReg(SrcRegX, RegState::Undef)
4446             .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
4447       } else {
4448         // Otherwise, expand to ORR WZR.
4449         BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
4450             .addReg(AArch64::WZR)
4451             .addReg(SrcReg, getKillRegState(KillSrc));
4452       }
4453     }
4454     return;
4455   }
4456 
4457   // Copy a Predicate register by ORRing with itself.
4458   if (AArch64::PPRRegClass.contains(DestReg) &&
4459       AArch64::PPRRegClass.contains(SrcReg)) {
4460     assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
4461     BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
4462       .addReg(SrcReg) // Pg
4463       .addReg(SrcReg)
4464       .addReg(SrcReg, getKillRegState(KillSrc));
4465     return;
4466   }
4467 
4468   // Copy a predicate-as-counter register by ORRing with itself as if it
4469   // were a regular predicate (mask) register.
4470   bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
4471   bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
4472   if (DestIsPNR || SrcIsPNR) {
4473     assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) &&
4474            "Unexpected predicate-as-counter register.");
4475     auto ToPPR = [](MCRegister R) -> MCRegister {
4476       return (R - AArch64::PN0) + AArch64::P0;
4477     };
4478     MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg;
4479     MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg;
4480 
4481     if (PPRSrcReg != PPRDestReg) {
4482       auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)
4483                        .addReg(PPRSrcReg) // Pg
4484                        .addReg(PPRSrcReg)
4485                        .addReg(PPRSrcReg, getKillRegState(KillSrc));
4486       if (DestIsPNR)
4487         NewMI.addDef(DestReg, RegState::Implicit);
4488     }
4489     return;
4490   }
4491 
4492   // Copy a Z register by ORRing with itself.
4493   if (AArch64::ZPRRegClass.contains(DestReg) &&
4494       AArch64::ZPRRegClass.contains(SrcReg)) {
4495     assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
4496     BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
4497       .addReg(SrcReg)
4498       .addReg(SrcReg, getKillRegState(KillSrc));
4499     return;
4500   }
4501 
4502   // Copy a Z register pair by copying the individual sub-registers.
4503   if ((AArch64::ZPR2RegClass.contains(DestReg) ||
4504        AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
4505       (AArch64::ZPR2RegClass.contains(SrcReg) ||
4506        AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
4507     assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
4508     static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
4509     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
4510                      Indices);
4511     return;
4512   }
4513 
4514   // Copy a Z register triple by copying the individual sub-registers.
4515   if (AArch64::ZPR3RegClass.contains(DestReg) &&
4516       AArch64::ZPR3RegClass.contains(SrcReg)) {
4517     assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
4518     static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
4519                                        AArch64::zsub2};
4520     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
4521                      Indices);
4522     return;
4523   }
4524 
4525   // Copy a Z register quad by copying the individual sub-registers.
4526   if ((AArch64::ZPR4RegClass.contains(DestReg) ||
4527        AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
4528       (AArch64::ZPR4RegClass.contains(SrcReg) ||
4529        AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
4530     assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
4531     static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
4532                                        AArch64::zsub2, AArch64::zsub3};
4533     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
4534                      Indices);
4535     return;
4536   }
4537 
4538   if (AArch64::GPR64spRegClass.contains(DestReg) &&
4539       (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
4540     if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
4541       // If either operand is SP, expand to ADD #0.
4542       BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
4543           .addReg(SrcReg, getKillRegState(KillSrc))
4544           .addImm(0)
4545           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4546     } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
4547       BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
4548           .addImm(0)
4549           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4550     } else {
4551       // Otherwise, expand to ORR XZR.
4552       BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
4553           .addReg(AArch64::XZR)
4554           .addReg(SrcReg, getKillRegState(KillSrc));
4555     }
4556     return;
4557   }
4558 
4559   // Copy a DDDD register quad by copying the individual sub-registers.
4560   if (AArch64::DDDDRegClass.contains(DestReg) &&
4561       AArch64::DDDDRegClass.contains(SrcReg)) {
4562     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
4563                                        AArch64::dsub2, AArch64::dsub3};
4564     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
4565                      Indices);
4566     return;
4567   }
4568 
4569   // Copy a DDD register triple by copying the individual sub-registers.
4570   if (AArch64::DDDRegClass.contains(DestReg) &&
4571       AArch64::DDDRegClass.contains(SrcReg)) {
4572     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
4573                                        AArch64::dsub2};
4574     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
4575                      Indices);
4576     return;
4577   }
4578 
4579   // Copy a DD register pair by copying the individual sub-registers.
4580   if (AArch64::DDRegClass.contains(DestReg) &&
4581       AArch64::DDRegClass.contains(SrcReg)) {
4582     static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
4583     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
4584                      Indices);
4585     return;
4586   }
4587 
4588   // Copy a QQQQ register quad by copying the individual sub-registers.
4589   if (AArch64::QQQQRegClass.contains(DestReg) &&
4590       AArch64::QQQQRegClass.contains(SrcReg)) {
4591     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
4592                                        AArch64::qsub2, AArch64::qsub3};
4593     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
4594                      Indices);
4595     return;
4596   }
4597 
4598   // Copy a QQQ register triple by copying the individual sub-registers.
4599   if (AArch64::QQQRegClass.contains(DestReg) &&
4600       AArch64::QQQRegClass.contains(SrcReg)) {
4601     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
4602                                        AArch64::qsub2};
4603     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
4604                      Indices);
4605     return;
4606   }
4607 
4608   // Copy a QQ register pair by copying the individual sub-registers.
4609   if (AArch64::QQRegClass.contains(DestReg) &&
4610       AArch64::QQRegClass.contains(SrcReg)) {
4611     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
4612     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
4613                      Indices);
4614     return;
4615   }
4616 
4617   if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
4618       AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
4619     static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
4620     copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
4621                     AArch64::XZR, Indices);
4622     return;
4623   }
4624 
4625   if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
4626       AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
4627     static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
4628     copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
4629                     AArch64::WZR, Indices);
4630     return;
4631   }
4632 
4633   if (AArch64::FPR128RegClass.contains(DestReg) &&
4634       AArch64::FPR128RegClass.contains(SrcReg)) {
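    // For illustration, a q1 <- q0 copy becomes one of (example registers):
    //   SVE/SME without NEON: orr z1.d, z0.d, z0.d
    //   NEON:                 orr v1.16b, v0.16b, v0.16b
    //   neither:              str q0, [sp, #-16]! ; ldr q1, [sp], #16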
4635     if (Subtarget.hasSVEorSME() && !Subtarget.isNeonAvailable())
4636       BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
4637           .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
4638           .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
4639           .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
4640     else if (Subtarget.hasNEON())
4641       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
4642           .addReg(SrcReg)
4643           .addReg(SrcReg, getKillRegState(KillSrc));
4644     else {
4645       BuildMI(MBB, I, DL, get(AArch64::STRQpre))
4646           .addReg(AArch64::SP, RegState::Define)
4647           .addReg(SrcReg, getKillRegState(KillSrc))
4648           .addReg(AArch64::SP)
4649           .addImm(-16);
4650       BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
4651           .addReg(AArch64::SP, RegState::Define)
4652           .addReg(DestReg, RegState::Define)
4653           .addReg(AArch64::SP)
4654           .addImm(16);
4655     }
4656     return;
4657   }
4658 
4659   if (AArch64::FPR64RegClass.contains(DestReg) &&
4660       AArch64::FPR64RegClass.contains(SrcReg)) {
4661     BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
4662         .addReg(SrcReg, getKillRegState(KillSrc));
4663     return;
4664   }
4665 
4666   if (AArch64::FPR32RegClass.contains(DestReg) &&
4667       AArch64::FPR32RegClass.contains(SrcReg)) {
4668     BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
4669         .addReg(SrcReg, getKillRegState(KillSrc));
4670     return;
4671   }
4672 
4673   if (AArch64::FPR16RegClass.contains(DestReg) &&
4674       AArch64::FPR16RegClass.contains(SrcReg)) {
4675     DestReg =
4676         RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
4677     SrcReg =
4678         RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
4679     BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
4680         .addReg(SrcReg, getKillRegState(KillSrc));
4681     return;
4682   }
4683 
4684   if (AArch64::FPR8RegClass.contains(DestReg) &&
4685       AArch64::FPR8RegClass.contains(SrcReg)) {
4686     DestReg =
4687         RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
4688     SrcReg =
4689         RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
4690     BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
4691         .addReg(SrcReg, getKillRegState(KillSrc));
4692     return;
4693   }
4694 
4695   // Copies between GPR64 and FPR64.
4696   if (AArch64::FPR64RegClass.contains(DestReg) &&
4697       AArch64::GPR64RegClass.contains(SrcReg)) {
4698     BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
4699         .addReg(SrcReg, getKillRegState(KillSrc));
4700     return;
4701   }
4702   if (AArch64::GPR64RegClass.contains(DestReg) &&
4703       AArch64::FPR64RegClass.contains(SrcReg)) {
4704     BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
4705         .addReg(SrcReg, getKillRegState(KillSrc));
4706     return;
4707   }
4708   // Copies between GPR32 and FPR32.
4709   if (AArch64::FPR32RegClass.contains(DestReg) &&
4710       AArch64::GPR32RegClass.contains(SrcReg)) {
4711     BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
4712         .addReg(SrcReg, getKillRegState(KillSrc));
4713     return;
4714   }
4715   if (AArch64::GPR32RegClass.contains(DestReg) &&
4716       AArch64::FPR32RegClass.contains(SrcReg)) {
4717     BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
4718         .addReg(SrcReg, getKillRegState(KillSrc));
4719     return;
4720   }
4721 
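  // NZCV is moved through the system-register file; for illustration,
  //   GPR64 -> NZCV: msr NZCV, x0
  //   NZCV -> GPR64: mrs x0, NZCV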
4722   if (DestReg == AArch64::NZCV) {
4723     assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
4724     BuildMI(MBB, I, DL, get(AArch64::MSR))
4725         .addImm(AArch64SysReg::NZCV)
4726         .addReg(SrcReg, getKillRegState(KillSrc))
4727         .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
4728     return;
4729   }
4730 
4731   if (SrcReg == AArch64::NZCV) {
4732     assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
4733     BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
4734         .addImm(AArch64SysReg::NZCV)
4735         .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
4736     return;
4737   }
4738 
4739 #ifndef NDEBUG
4740   const TargetRegisterInfo &TRI = getRegisterInfo();
4741   errs() << TRI.getRegAsmName(DestReg) << " = COPY "
4742          << TRI.getRegAsmName(SrcReg) << "\n";
4743 #endif
4744   llvm_unreachable("unimplemented reg-to-reg copy");
4745 }
4746 
4747 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
4748                                     MachineBasicBlock &MBB,
4749                                     MachineBasicBlock::iterator InsertBefore,
4750                                     const MCInstrDesc &MCID,
4751                                     Register SrcReg, bool IsKill,
4752                                     unsigned SubIdx0, unsigned SubIdx1, int FI,
4753                                     MachineMemOperand *MMO) {
4754   Register SrcReg0 = SrcReg;
4755   Register SrcReg1 = SrcReg;
4756   if (SrcReg.isPhysical()) {
4757     SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
4758     SubIdx0 = 0;
4759     SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
4760     SubIdx1 = 0;
4761   }
4762   BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
4763       .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
4764       .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
4765       .addFrameIndex(FI)
4766       .addImm(0)
4767       .addMemOperand(MMO);
4768 }
4769 
4770 void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
4771                                            MachineBasicBlock::iterator MBBI,
4772                                            Register SrcReg, bool isKill, int FI,
4773                                            const TargetRegisterClass *RC,
4774                                            const TargetRegisterInfo *TRI,
4775                                            Register VReg) const {
4776   MachineFunction &MF = *MBB.getParent();
4777   MachineFrameInfo &MFI = MF.getFrameInfo();
4778 
4779   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
4780   MachineMemOperand *MMO =
4781       MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
4782                               MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
4783   unsigned Opc = 0;
4784   bool Offset = true;
4785   MCRegister PNRReg = MCRegister::NoRegister;
4786   unsigned StackID = TargetStackID::Default;
4787   switch (TRI->getSpillSize(*RC)) {
4788   case 1:
4789     if (AArch64::FPR8RegClass.hasSubClassEq(RC))
4790       Opc = AArch64::STRBui;
4791     break;
4792   case 2:
4793     if (AArch64::FPR16RegClass.hasSubClassEq(RC))
4794       Opc = AArch64::STRHui;
4795     else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
4796       assert(Subtarget.hasSVEorSME() &&
4797              "Unexpected register store without SVE store instructions");
4798       Opc = AArch64::STR_PXI;
4799       StackID = TargetStackID::ScalableVector;
4800     } else if (AArch64::PNRRegClass.hasSubClassEq(RC)) {
4801       assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) &&
4802              "Unexpected register store without SVE2p1 or SME2");
4803       if (SrcReg.isVirtual()) {
4804         auto NewSrcReg =
4805             MF.getRegInfo().createVirtualRegister(&AArch64::PPRRegClass);
4806         BuildMI(MBB, MBBI, DebugLoc(), get(TargetOpcode::COPY), NewSrcReg)
4807             .addReg(SrcReg);
4808         SrcReg = NewSrcReg;
4809       } else
4810         SrcReg = (SrcReg - AArch64::PN0) + AArch64::P0;
4811       Opc = AArch64::STR_PXI;
4812       StackID = TargetStackID::ScalableVector;
4813     }
4814     break;
4815   case 4:
4816     if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
4817       Opc = AArch64::STRWui;
4818       if (SrcReg.isVirtual())
4819         MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
4820       else
4821         assert(SrcReg != AArch64::WSP);
4822     } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
4823       Opc = AArch64::STRSui;
4824     else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
4825       Opc = AArch64::STR_PPXI;
4826       StackID = TargetStackID::ScalableVector;
4827     }
4828     break;
4829   case 8:
4830     if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
4831       Opc = AArch64::STRXui;
4832       if (SrcReg.isVirtual())
4833         MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
4834       else
4835         assert(SrcReg != AArch64::SP);
4836     } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
4837       Opc = AArch64::STRDui;
4838     } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
4839       storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
4840                               get(AArch64::STPWi), SrcReg, isKill,
4841                               AArch64::sube32, AArch64::subo32, FI, MMO);
4842       return;
4843     }
4844     break;
4845   case 16:
4846     if (AArch64::FPR128RegClass.hasSubClassEq(RC))
4847       Opc = AArch64::STRQui;
4848     else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
4849       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4850       Opc = AArch64::ST1Twov1d;
4851       Offset = false;
4852     } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
4853       storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
4854                               get(AArch64::STPXi), SrcReg, isKill,
4855                               AArch64::sube64, AArch64::subo64, FI, MMO);
4856       return;
4857     } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
4858       assert(Subtarget.hasSVEorSME() &&
4859              "Unexpected register store without SVE store instructions");
4860       Opc = AArch64::STR_ZXI;
4861       StackID = TargetStackID::ScalableVector;
4862     }
4863     break;
4864   case 24:
4865     if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
4866       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4867       Opc = AArch64::ST1Threev1d;
4868       Offset = false;
4869     }
4870     break;
4871   case 32:
4872     if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
4873       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4874       Opc = AArch64::ST1Fourv1d;
4875       Offset = false;
4876     } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
4877       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4878       Opc = AArch64::ST1Twov2d;
4879       Offset = false;
4880     } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
4881                AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
4882       assert(Subtarget.hasSVEorSME() &&
4883              "Unexpected register store without SVE store instructions");
4884       Opc = AArch64::STR_ZZXI;
4885       StackID = TargetStackID::ScalableVector;
4886     }
4887     break;
4888   case 48:
4889     if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
4890       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4891       Opc = AArch64::ST1Threev2d;
4892       Offset = false;
4893     } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
4894       assert(Subtarget.hasSVEorSME() &&
4895              "Unexpected register store without SVE store instructions");
4896       Opc = AArch64::STR_ZZZXI;
4897       StackID = TargetStackID::ScalableVector;
4898     }
4899     break;
4900   case 64:
4901     if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
4902       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4903       Opc = AArch64::ST1Fourv2d;
4904       Offset = false;
4905     } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
4906                AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
4907       assert(Subtarget.hasSVEorSME() &&
4908              "Unexpected register store without SVE store instructions");
4909       Opc = AArch64::STR_ZZZZXI;
4910       StackID = TargetStackID::ScalableVector;
4911     }
4912     break;
4913   }
4914   assert(Opc && "Unknown register class");
4915   MFI.setStackID(FI, StackID);
4916 
4917   const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
4918                                      .addReg(SrcReg, getKillRegState(isKill))
4919                                      .addFrameIndex(FI);
4920 
4921   if (Offset)
4922     MI.addImm(0);
4923   if (PNRReg.isValid())
4924     MI.addDef(PNRReg, RegState::Implicit);
4925   MI.addMemOperand(MMO);
4926 }
4927 
4928 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
4929                                      MachineBasicBlock &MBB,
4930                                      MachineBasicBlock::iterator InsertBefore,
4931                                      const MCInstrDesc &MCID,
4932                                      Register DestReg, unsigned SubIdx0,
4933                                      unsigned SubIdx1, int FI,
4934                                      MachineMemOperand *MMO) {
4935   Register DestReg0 = DestReg;
4936   Register DestReg1 = DestReg;
4937   bool IsUndef = true;
4938   if (DestReg.isPhysical()) {
4939     DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
4940     SubIdx0 = 0;
4941     DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
4942     SubIdx1 = 0;
4943     IsUndef = false;
4944   }
4945   BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
4946       .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
4947       .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
4948       .addFrameIndex(FI)
4949       .addImm(0)
4950       .addMemOperand(MMO);
4951 }
4952 
4953 void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
4954                                             MachineBasicBlock::iterator MBBI,
4955                                             Register DestReg, int FI,
4956                                             const TargetRegisterClass *RC,
4957                                             const TargetRegisterInfo *TRI,
4958                                             Register VReg) const {
4959   MachineFunction &MF = *MBB.getParent();
4960   MachineFrameInfo &MFI = MF.getFrameInfo();
4961   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
4962   MachineMemOperand *MMO =
4963       MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
4964                               MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
4965 
4966   unsigned Opc = 0;
4967   bool Offset = true;
4968   unsigned StackID = TargetStackID::Default;
4969   Register PNRReg = MCRegister::NoRegister;
4970   switch (TRI->getSpillSize(*RC)) {
4971   case 1:
4972     if (AArch64::FPR8RegClass.hasSubClassEq(RC))
4973       Opc = AArch64::LDRBui;
4974     break;
4975   case 2:
4976     if (AArch64::FPR16RegClass.hasSubClassEq(RC))
4977       Opc = AArch64::LDRHui;
4978     else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
4979       assert(Subtarget.hasSVEorSME() &&
4980              "Unexpected register load without SVE load instructions");
4981       Opc = AArch64::LDR_PXI;
4982       StackID = TargetStackID::ScalableVector;
4983     } else if (AArch64::PNRRegClass.hasSubClassEq(RC)) {
4984       assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) &&
4985              "Unexpected register load without SVE2p1 or SME2");
4986       PNRReg = DestReg;
4987       if (DestReg.isVirtual())
4988         DestReg = MF.getRegInfo().createVirtualRegister(&AArch64::PPRRegClass);
4989       else
4990         DestReg = (DestReg - AArch64::PN0) + AArch64::P0;
4991       Opc = AArch64::LDR_PXI;
4992       StackID = TargetStackID::ScalableVector;
4993     }
4994     break;
4995   case 4:
4996     if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
4997       Opc = AArch64::LDRWui;
4998       if (DestReg.isVirtual())
4999         MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
5000       else
5001         assert(DestReg != AArch64::WSP);
5002     } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
5003       Opc = AArch64::LDRSui;
5004     else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
5005       Opc = AArch64::LDR_PPXI;
5006       StackID = TargetStackID::ScalableVector;
5007     }
5008     break;
5009   case 8:
5010     if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
5011       Opc = AArch64::LDRXui;
5012       if (DestReg.isVirtual())
5013         MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
5014       else
5015         assert(DestReg != AArch64::SP);
5016     } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
5017       Opc = AArch64::LDRDui;
5018     } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
5019       loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
5020                                get(AArch64::LDPWi), DestReg, AArch64::sube32,
5021                                AArch64::subo32, FI, MMO);
5022       return;
5023     }
5024     break;
5025   case 16:
5026     if (AArch64::FPR128RegClass.hasSubClassEq(RC))
5027       Opc = AArch64::LDRQui;
5028     else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
5029       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5030       Opc = AArch64::LD1Twov1d;
5031       Offset = false;
5032     } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
5033       loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
5034                                get(AArch64::LDPXi), DestReg, AArch64::sube64,
5035                                AArch64::subo64, FI, MMO);
5036       return;
5037     } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
5038       assert(Subtarget.hasSVEorSME() &&
5039              "Unexpected register load without SVE load instructions");
5040       Opc = AArch64::LDR_ZXI;
5041       StackID = TargetStackID::ScalableVector;
5042     }
5043     break;
5044   case 24:
5045     if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
5046       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5047       Opc = AArch64::LD1Threev1d;
5048       Offset = false;
5049     }
5050     break;
5051   case 32:
5052     if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
5053       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5054       Opc = AArch64::LD1Fourv1d;
5055       Offset = false;
5056     } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
5057       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5058       Opc = AArch64::LD1Twov2d;
5059       Offset = false;
5060     } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
5061                AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5062       assert(Subtarget.hasSVEorSME() &&
5063              "Unexpected register load without SVE load instructions");
5064       Opc = AArch64::LDR_ZZXI;
5065       StackID = TargetStackID::ScalableVector;
5066     }
5067     break;
5068   case 48:
5069     if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
5070       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5071       Opc = AArch64::LD1Threev2d;
5072       Offset = false;
5073     } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
5074       assert(Subtarget.hasSVEorSME() &&
5075              "Unexpected register load without SVE load instructions");
5076       Opc = AArch64::LDR_ZZZXI;
5077       StackID = TargetStackID::ScalableVector;
5078     }
5079     break;
5080   case 64:
5081     if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
5082       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5083       Opc = AArch64::LD1Fourv2d;
5084       Offset = false;
5085     } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
5086                AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5087       assert(Subtarget.hasSVEorSME() &&
5088              "Unexpected register load without SVE load instructions");
5089       Opc = AArch64::LDR_ZZZZXI;
5090       StackID = TargetStackID::ScalableVector;
5091     }
5092     break;
5093   }
5094 
5095   assert(Opc && "Unknown register class");
5096   MFI.setStackID(FI, StackID);
5097 
5098   const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
5099                                      .addReg(DestReg, getDefRegState(true))
5100                                      .addFrameIndex(FI);
5101   if (Offset)
5102     MI.addImm(0);
5103   if (PNRReg.isValid() && !PNRReg.isVirtual())
5104     MI.addDef(PNRReg, RegState::Implicit);
5105   MI.addMemOperand(MMO);
5106 
5107   if (PNRReg.isValid() && PNRReg.isVirtual())
5108     BuildMI(MBB, MBBI, DebugLoc(), get(TargetOpcode::COPY), PNRReg)
5109         .addReg(DestReg);
5110 }
5111 
5112 bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
5113                                            const MachineInstr &UseMI,
5114                                            const TargetRegisterInfo *TRI) {
5115   return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
5116                                          UseMI.getIterator()),
5117                 [TRI](const MachineInstr &I) {
5118                   return I.modifiesRegister(AArch64::NZCV, TRI) ||
5119                          I.readsRegister(AArch64::NZCV, TRI);
5120                 });
5121 }
5122 
5123 void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
5124     const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
5125   // The smallest scalable element supported by scaled SVE addressing
5126   // modes is a predicate, which is 2 scalable bytes in size. So the scalable
5127   // byte offset must always be a multiple of 2.
5128   assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
5129 
5130   // VGSized offsets are divided by '2', because the VG register is the
5131   // number of 64-bit granules as opposed to 128-bit vector chunks,
5132   // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
5133   // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
5134   // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
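  // For illustration, a StackOffset of 16 fixed bytes plus 8 scalable bytes
  // yields ByteSized = 16 and VGSized = 4, i.e. a DWARF offset of 16 + 4 * VG.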
5135   ByteSized = Offset.getFixed();
5136   VGSized = Offset.getScalable() / 2;
5137 }
5138 
5139 /// Returns the offset in parts to which this frame offset can be
5140 /// decomposed for the purpose of describing a frame offset.
5141 /// For non-scalable offsets this is simply its byte size.
5142 void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
5143     const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
5144     int64_t &NumDataVectors) {
5145   // The smallest scalable element supported by scaled SVE addressing
5146   // modes is a predicate, which is 2 scalable bytes in size. So the scalable
5147   // byte offset must always be a multiple of 2.
5148   assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
5149 
5150   NumBytes = Offset.getFixed();
5151   NumDataVectors = 0;
5152   NumPredicateVectors = Offset.getScalable() / 2;
5153   // This method is used to get the offsets to adjust the frame offset.
5154   // If the function requires ADDPL to be used and needs more than two ADDPL
5155   // instructions, part of the offset is folded into NumDataVectors so that it
5156   // uses ADDVL for part of it, reducing the number of ADDPL instructions.
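  // For illustration, a scalable offset of 130 bytes gives
  // NumPredicateVectors = 65, which folds to NumDataVectors = 8 and
  // NumPredicateVectors = 1 (one ADDVL #8 plus one ADDPL #1 rather than a
  // longer run of ADDPLs).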
5157   if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
5158       NumPredicateVectors > 62) {
5159     NumDataVectors = NumPredicateVectors / 8;
5160     NumPredicateVectors -= NumDataVectors * 8;
5161   }
5162 }
5163 
5164 // Convenience function to create a DWARF expression for
5165 //   Expr + NumBytes + NumVGScaledBytes * AArch64::VG
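//
// For illustration, NumBytes = -16 and NumVGScaledBytes = -8 append
//   DW_OP_consts(-16), DW_OP_plus,
//   DW_OP_consts(-8), DW_OP_bregx(VG, 0), DW_OP_mul, DW_OP_plus
// and extend the disassembly comment with " - 16 - 8 * VG".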
5166 static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes,
5167                                      int NumVGScaledBytes, unsigned VG,
5168                                      llvm::raw_string_ostream &Comment) {
5169   uint8_t buffer[16];
5170 
5171   if (NumBytes) {
5172     Expr.push_back(dwarf::DW_OP_consts);
5173     Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
5174     Expr.push_back((uint8_t)dwarf::DW_OP_plus);
5175     Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
5176   }
5177 
5178   if (NumVGScaledBytes) {
5179     Expr.push_back((uint8_t)dwarf::DW_OP_consts);
5180     Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));
5181 
5182     Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
5183     Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
5184     Expr.push_back(0);
5185 
5186     Expr.push_back((uint8_t)dwarf::DW_OP_mul);
5187     Expr.push_back((uint8_t)dwarf::DW_OP_plus);
5188 
5189     Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
5190             << std::abs(NumVGScaledBytes) << " * VG";
5191   }
5192 }
5193 
5194 // Creates an MCCFIInstruction:
5195 //    { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
5196 static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
5197                                                unsigned Reg,
5198                                                const StackOffset &Offset) {
5199   int64_t NumBytes, NumVGScaledBytes;
5200   AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
5201                                                         NumVGScaledBytes);
5202   std::string CommentBuffer;
5203   llvm::raw_string_ostream Comment(CommentBuffer);
5204 
5205   if (Reg == AArch64::SP)
5206     Comment << "sp";
5207   else if (Reg == AArch64::FP)
5208     Comment << "fp";
5209   else
5210     Comment << printReg(Reg, &TRI);
5211 
5212   // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG)
5213   SmallString<64> Expr;
5214   unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5215   Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg));
5216   Expr.push_back(0);
5217   appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
5218                            TRI.getDwarfRegNum(AArch64::VG, true), Comment);
5219 
5220   // Wrap this into DW_CFA_def_cfa.
5221   SmallString<64> DefCfaExpr;
5222   DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
5223   uint8_t buffer[16];
5224   DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer));
5225   DefCfaExpr.append(Expr.str());
5226   return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(),
5227                                         Comment.str());
5228 }
5229 
5230 MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI,
5231                                     unsigned FrameReg, unsigned Reg,
5232                                     const StackOffset &Offset,
5233                                     bool LastAdjustmentWasScalable) {
5234   if (Offset.getScalable())
5235     return createDefCFAExpression(TRI, Reg, Offset);
5236 
5237   if (FrameReg == Reg && !LastAdjustmentWasScalable)
5238     return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));
5239 
5240   unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5241   return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
5242 }
5243 
5244 MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI,
5245                                        unsigned Reg,
5246                                        const StackOffset &OffsetFromDefCFA) {
5247   int64_t NumBytes, NumVGScaledBytes;
5248   AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
5249       OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
5250 
5251   unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5252 
5253   // Non-scalable offsets can use DW_CFA_offset directly.
5254   if (!NumVGScaledBytes)
5255     return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
5256 
5257   std::string CommentBuffer;
5258   llvm::raw_string_ostream Comment(CommentBuffer);
5259   Comment << printReg(Reg, &TRI) << "  @ cfa";
5260 
5261   // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
5262   SmallString<64> OffsetExpr;
5263   appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
5264                            TRI.getDwarfRegNum(AArch64::VG, true), Comment);
5265 
5266   // Wrap this into DW_CFA_expression
5267   SmallString<64> CfaExpr;
5268   CfaExpr.push_back(dwarf::DW_CFA_expression);
5269   uint8_t buffer[16];
5270   CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
5271   CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
5272   CfaExpr.append(OffsetExpr.str());
5273 
5274   return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(),
5275                                         Comment.str());
5276 }
5277 
5278 // Helper function to emit a frame offset adjustment from a given
5279 // pointer (SrcReg), stored into DestReg. This function is explicit
5280 // in that the caller must supply the opcode to use.
5281 static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
5282                                MachineBasicBlock::iterator MBBI,
5283                                const DebugLoc &DL, unsigned DestReg,
5284                                unsigned SrcReg, int64_t Offset, unsigned Opc,
5285                                const TargetInstrInfo *TII,
5286                                MachineInstr::MIFlag Flag, bool NeedsWinCFI,
5287                                bool *HasWinCFI, bool EmitCFAOffset,
5288                                StackOffset CFAOffset, unsigned FrameReg) {
5289   int Sign = 1;
5290   unsigned MaxEncoding, ShiftSize;
5291   switch (Opc) {
5292   case AArch64::ADDXri:
5293   case AArch64::ADDSXri:
5294   case AArch64::SUBXri:
5295   case AArch64::SUBSXri:
5296     MaxEncoding = 0xfff;
5297     ShiftSize = 12;
5298     break;
5299   case AArch64::ADDVL_XXI:
5300   case AArch64::ADDPL_XXI:
5301   case AArch64::ADDSVL_XXI:
5302   case AArch64::ADDSPL_XXI:
5303     MaxEncoding = 31;
5304     ShiftSize = 0;
5305     if (Offset < 0) {
5306       MaxEncoding = 32;
5307       Sign = -1;
5308       Offset = -Offset;
5309     }
5310     break;
5311   default:
5312     llvm_unreachable("Unsupported opcode");
5313   }
5314 
5315   // `Offset` can be in bytes or in "scalable bytes".
5316   int VScale = 1;
5317   if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
5318     VScale = 16;
5319   else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
5320     VScale = 2;
5321 
5322   // FIXME: If the offset won't fit in 24-bits, compute the offset into a
5323   // scratch register.  If DestReg is a virtual register, use it as the
5324   // scratch register; otherwise, create a new virtual register (to be
5325   // replaced by the scavenger at the end of PEI).  That case can be optimized
5326   // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
5327   // register can be loaded with offset%8 and the add/sub can use an extending
5328   // instruction with LSL#3.
5329   // Currently the function handles any offsets but generates a poor sequence
5330   // of code.
5331   //  assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
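  //
  // For illustration, with Opc = ADDXri (MaxEncoding = 0xfff, ShiftSize = 12)
  // and Offset = 4097 the loop below emits two instructions (example regs):
  //   add xD, xS, #1, lsl #12
  //   add xD, xD, #1
  // because only a 12-bit immediate, optionally shifted left by 12, fits.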
5332 
5333   const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
5334   Register TmpReg = DestReg;
5335   if (TmpReg == AArch64::XZR)
5336     TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
5337         &AArch64::GPR64RegClass);
5338   do {
5339     uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
5340     unsigned LocalShiftSize = 0;
5341     if (ThisVal > MaxEncoding) {
5342       ThisVal = ThisVal >> ShiftSize;
5343       LocalShiftSize = ShiftSize;
5344     }
5345     assert((ThisVal >> ShiftSize) <= MaxEncoding &&
5346            "Encoding cannot handle value that big");
5347 
5348     Offset -= ThisVal << LocalShiftSize;
5349     if (Offset == 0)
5350       TmpReg = DestReg;
5351     auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
5352                    .addReg(SrcReg)
5353                    .addImm(Sign * (int)ThisVal);
5354     if (ShiftSize)
5355       MBI = MBI.addImm(
5356           AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
5357     MBI = MBI.setMIFlag(Flag);
5358 
5359     auto Change =
5360         VScale == 1
5361             ? StackOffset::getFixed(ThisVal << LocalShiftSize)
5362             : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
5363     if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
5364       CFAOffset += Change;
5365     else
5366       CFAOffset -= Change;
5367     if (EmitCFAOffset && DestReg == TmpReg) {
5368       MachineFunction &MF = *MBB.getParent();
5369       const TargetSubtargetInfo &STI = MF.getSubtarget();
5370       const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
5371 
5372       unsigned CFIIndex = MF.addFrameInst(
5373           createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
5374       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
5375           .addCFIIndex(CFIIndex)
5376           .setMIFlags(Flag);
5377     }
5378 
5379     if (NeedsWinCFI) {
5380       assert(Sign == 1 && "SEH directives should always have a positive sign");
5381       int Imm = (int)(ThisVal << LocalShiftSize);
5382       if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
5383           (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
5384         if (HasWinCFI)
5385           *HasWinCFI = true;
5386         if (Imm == 0)
5387           BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
5388         else
5389           BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
5390               .addImm(Imm)
5391               .setMIFlag(Flag);
5392         assert(Offset == 0 && "Expected remaining offset to be zero to "
5393                               "emit a single SEH directive");
5394       } else if (DestReg == AArch64::SP) {
5395         if (HasWinCFI)
5396           *HasWinCFI = true;
5397         assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
5398         BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
5399             .addImm(Imm)
5400             .setMIFlag(Flag);
5401       }
5402     }
5403 
5404     SrcReg = TmpReg;
5405   } while (Offset);
5406 }
5407 
5408 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
5409                            MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
5410                            unsigned DestReg, unsigned SrcReg,
5411                            StackOffset Offset, const TargetInstrInfo *TII,
5412                            MachineInstr::MIFlag Flag, bool SetNZCV,
5413                            bool NeedsWinCFI, bool *HasWinCFI,
5414                            bool EmitCFAOffset, StackOffset CFAOffset,
5415                            unsigned FrameReg) {
5416   // If a function is marked as arm_locally_streaming, then the runtime value of
5417   // vscale in the prologue/epilogue is different from the runtime value of vscale
5418   // in the function's body. To avoid having to consider multiple vscales,
5419   // we can use `addsvl` to allocate any scalable stack-slots, which under
5420   // most circumstances will be only locals, not callee-save slots.
5421   const Function &F = MBB.getParent()->getFunction();
5422   bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");
5423 
5424   int64_t Bytes, NumPredicateVectors, NumDataVectors;
5425   AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
5426       Offset, Bytes, NumPredicateVectors, NumDataVectors);
5427 
5428   // First emit non-scalable frame offsets, or a simple 'mov'.
5429   if (Bytes || (!Offset && SrcReg != DestReg)) {
5430     assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
5431            "SP increment/decrement not 8-byte aligned");
5432     unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
5433     if (Bytes < 0) {
5434       Bytes = -Bytes;
5435       Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
5436     }
5437     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
5438                        NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
5439                        FrameReg);
5440     CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
5441                      ? StackOffset::getFixed(-Bytes)
5442                      : StackOffset::getFixed(Bytes);
5443     SrcReg = DestReg;
5444     FrameReg = DestReg;
5445   }
5446 
5447   assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
5448          "SetNZCV not supported with SVE vectors");
5449   assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
5450          "WinCFI not supported with SVE vectors");
5451 
5452   if (NumDataVectors) {
5453     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
5454                        UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI,
5455                        TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
5456                        CFAOffset, FrameReg);
5457     CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
5458     SrcReg = DestReg;
5459   }
5460 
5461   if (NumPredicateVectors) {
5462     assert(DestReg != AArch64::SP && "Unaligned access to SP");
5463     emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
5464                        UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI,
5465                        TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
5466                        CFAOffset, FrameReg);
5467   }
5468 }
5469 
5470 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
5471     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
5472     MachineBasicBlock::iterator InsertPt, int FrameIndex,
5473     LiveIntervals *LIS, VirtRegMap *VRM) const {
5474   // This is a bit of a hack. Consider this instruction:
5475   //
5476   //   %0 = COPY %sp; GPR64all:%0
5477   //
5478   // We explicitly chose GPR64all for the virtual register so such a copy might
5479   // be eliminated by RegisterCoalescer. However, that may not be possible, and
5480   // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
5481   // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
5482   //
5483   // To prevent that, we are going to constrain the %0 register class here.
5484   if (MI.isFullCopy()) {
5485     Register DstReg = MI.getOperand(0).getReg();
5486     Register SrcReg = MI.getOperand(1).getReg();
5487     if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
5488       MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
5489       return nullptr;
5490     }
5491     if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
5492       MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
5493       return nullptr;
5494     }
5495     // Nothing can be folded with a copy from/to NZCV.
5496     if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
5497       return nullptr;
5498   }
5499 
5500   // Handle the case where a copy is being spilled or filled but the source
5501   // and destination register class don't match.  For example:
5502   //
5503   //   %0 = COPY %xzr; GPR64common:%0
5504   //
5505   // In this case we can still safely fold away the COPY and generate the
5506   // following spill code:
5507   //
5508   //   STRXui %xzr, %stack.0
5509   //
5510   // This also eliminates spilled cross register class COPYs (e.g. between x and
5511   // d regs) of the same size.  For example:
5512   //
5513   //   %0 = COPY %1; GPR64:%0, FPR64:%1
5514   //
5515   // will be filled as
5516   //
5517   //   LDRDui %0, fi<#0>
5518   //
5519   // instead of
5520   //
5521   //   LDRXui %Temp, fi<#0>
5522   //   %0 = FMOV %Temp
5523   //
5524   if (MI.isCopy() && Ops.size() == 1 &&
5525       // Make sure we're only folding the explicit COPY defs/uses.
5526       (Ops[0] == 0 || Ops[0] == 1)) {
5527     bool IsSpill = Ops[0] == 0;
5528     bool IsFill = !IsSpill;
5529     const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
5530     const MachineRegisterInfo &MRI = MF.getRegInfo();
5531     MachineBasicBlock &MBB = *MI.getParent();
5532     const MachineOperand &DstMO = MI.getOperand(0);
5533     const MachineOperand &SrcMO = MI.getOperand(1);
5534     Register DstReg = DstMO.getReg();
5535     Register SrcReg = SrcMO.getReg();
5536     // This is slightly expensive to compute for physical regs since
5537     // getMinimalPhysRegClass is slow.
5538     auto getRegClass = [&](unsigned Reg) {
5539       return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
5540                                               : TRI.getMinimalPhysRegClass(Reg);
5541     };
5542 
5543     if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
5544       assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
5545                  TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
5546              "Mismatched register size in non subreg COPY");
5547       if (IsSpill)
5548         storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
5549                             getRegClass(SrcReg), &TRI, Register());
5550       else
5551         loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
5552                              getRegClass(DstReg), &TRI, Register());
5553       return &*--InsertPt;
5554     }
5555 
5556     // Handle cases like spilling def of:
5557     //
5558     //   %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
5559     //
5560     // where the physical register source can be widened and stored to the full
5561     // virtual reg destination stack slot, in this case producing:
5562     //
5563     //   STRXui %xzr, %stack.0
5564     //
5565     if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
5566         TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) {
5567       assert(SrcMO.getSubReg() == 0 &&
5568              "Unexpected subreg on physical register");
5569       storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
5570                           FrameIndex, &AArch64::GPR64RegClass, &TRI,
5571                           Register());
5572       return &*--InsertPt;
5573     }
5574 
5575     // Handle cases like filling use of:
5576     //
5577     //   %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
5578     //
5579     // where we can load the full virtual reg source stack slot, into the subreg
5580     // destination, in this case producing:
5581     //
5582     //   LDRWui %0:sub_32<def,read-undef>, %stack.0
5583     //
5584     if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
5585       const TargetRegisterClass *FillRC;
5586       switch (DstMO.getSubReg()) {
5587       default:
5588         FillRC = nullptr;
5589         break;
5590       case AArch64::sub_32:
5591         FillRC = &AArch64::GPR32RegClass;
5592         break;
5593       case AArch64::ssub:
5594         FillRC = &AArch64::FPR32RegClass;
5595         break;
5596       case AArch64::dsub:
5597         FillRC = &AArch64::FPR64RegClass;
5598         break;
5599       }
5600 
5601       if (FillRC) {
5602         assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
5603                    TRI.getRegSizeInBits(*FillRC) &&
5604                "Mismatched regclass size on folded subreg COPY");
5605         loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI,
5606                              Register());
5607         MachineInstr &LoadMI = *--InsertPt;
5608         MachineOperand &LoadDst = LoadMI.getOperand(0);
5609         assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
5610         LoadDst.setSubReg(DstMO.getSubReg());
5611         LoadDst.setIsUndef();
5612         return &LoadMI;
5613       }
5614     }
5615   }
5616 
5617   // Cannot fold.
5618   return nullptr;
5619 }
5620 
5621 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
5622                                     StackOffset &SOffset,
5623                                     bool *OutUseUnscaledOp,
5624                                     unsigned *OutUnscaledOp,
5625                                     int64_t *EmittableOffset) {
5626   // Set output values in case of early exit.
5627   if (EmittableOffset)
5628     *EmittableOffset = 0;
5629   if (OutUseUnscaledOp)
5630     *OutUseUnscaledOp = false;
5631   if (OutUnscaledOp)
5632     *OutUnscaledOp = 0;
5633 
5634   // Exit early for structured vector spills/fills as they can't take an
5635   // immediate offset.
5636   switch (MI.getOpcode()) {
5637   default:
5638     break;
5639   case AArch64::LD1Rv1d:
5640   case AArch64::LD1Rv2s:
5641   case AArch64::LD1Rv2d:
5642   case AArch64::LD1Rv4h:
5643   case AArch64::LD1Rv4s:
5644   case AArch64::LD1Rv8b:
5645   case AArch64::LD1Rv8h:
5646   case AArch64::LD1Rv16b:
5647   case AArch64::LD1Twov2d:
5648   case AArch64::LD1Threev2d:
5649   case AArch64::LD1Fourv2d:
5650   case AArch64::LD1Twov1d:
5651   case AArch64::LD1Threev1d:
5652   case AArch64::LD1Fourv1d:
5653   case AArch64::ST1Twov2d:
5654   case AArch64::ST1Threev2d:
5655   case AArch64::ST1Fourv2d:
5656   case AArch64::ST1Twov1d:
5657   case AArch64::ST1Threev1d:
5658   case AArch64::ST1Fourv1d:
5659   case AArch64::ST1i8:
5660   case AArch64::ST1i16:
5661   case AArch64::ST1i32:
5662   case AArch64::ST1i64:
5663   case AArch64::IRG:
5664   case AArch64::IRGstack:
5665   case AArch64::STGloop:
5666   case AArch64::STZGloop:
5667     return AArch64FrameOffsetCannotUpdate;
5668   }
5669 
5670   // Get the min/max offset and the scale.
5671   TypeSize ScaleValue(0U, false), Width(0U, false);
5672   int64_t MinOff, MaxOff;
5673   if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
5674                                       MaxOff))
5675     llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
5676 
5677   // Construct the complete offset.
5678   bool IsMulVL = ScaleValue.isScalable();
5679   unsigned Scale = ScaleValue.getKnownMinValue();
5680   int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
5681 
5682   const MachineOperand &ImmOpnd =
5683       MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
5684   Offset += ImmOpnd.getImm() * Scale;
5685 
5686   // If the offset doesn't match the scale, we rewrite the instruction to
5687   // use the unscaled instruction instead. Likewise, if we have a negative
5688   // offset and there is an unscaled op to use.
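  // For illustration, a byte offset of 20 on LDRXui (scale 8) is not a
  // multiple of the scale, so the access is rewritten to the unscaled LDURXi
  // (scale 1) and #20 is encoded directly. (Illustrative only; the actual
  // opcode pair comes from getUnscaledLdSt/getMemOpInfo.)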
5689   std::optional<unsigned> UnscaledOp =
5690       AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
5691   bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
5692   if (useUnscaledOp &&
5693       !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
5694                                       MaxOff))
5695     llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
5696 
5697   Scale = ScaleValue.getKnownMinValue();
5698   assert(IsMulVL == ScaleValue.isScalable() &&
5699          "Unscaled opcode has different value for scalable");
5700 
5701   int64_t Remainder = Offset % Scale;
5702   assert(!(Remainder && useUnscaledOp) &&
5703          "Cannot have remainder when using unscaled op");
5704 
5705   assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
5706   int64_t NewOffset = Offset / Scale;
5707   if (MinOff <= NewOffset && NewOffset <= MaxOff)
5708     Offset = Remainder;
5709   else {
5710     NewOffset = NewOffset < 0 ? MinOff : MaxOff;
5711     Offset = Offset - NewOffset * Scale;
5712   }
5713 
5714   if (EmittableOffset)
5715     *EmittableOffset = NewOffset;
5716   if (OutUseUnscaledOp)
5717     *OutUseUnscaledOp = useUnscaledOp;
5718   if (OutUnscaledOp && UnscaledOp)
5719     *OutUnscaledOp = *UnscaledOp;
5720 
5721   if (IsMulVL)
5722     SOffset = StackOffset::get(SOffset.getFixed(), Offset);
5723   else
5724     SOffset = StackOffset::get(Offset, SOffset.getScalable());
5725   return AArch64FrameOffsetCanUpdate |
5726          (SOffset ? 0 : AArch64FrameOffsetIsLegal);
5727 }
5728 
5729 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
5730                                     unsigned FrameReg, StackOffset &Offset,
5731                                     const AArch64InstrInfo *TII) {
5732   unsigned Opcode = MI.getOpcode();
5733   unsigned ImmIdx = FrameRegIdx + 1;
5734 
5735   if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
5736     Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
5737     emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
5738                     MI.getOperand(0).getReg(), FrameReg, Offset, TII,
5739                     MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
5740     MI.eraseFromParent();
5741     Offset = StackOffset();
5742     return true;
5743   }
5744 
5745   int64_t NewOffset;
5746   unsigned UnscaledOp;
5747   bool UseUnscaledOp;
5748   int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
5749                                          &UnscaledOp, &NewOffset);
5750   if (Status & AArch64FrameOffsetCanUpdate) {
5751     if (Status & AArch64FrameOffsetIsLegal)
5752       // Replace the FrameIndex with FrameReg.
5753       MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
5754     if (UseUnscaledOp)
5755       MI.setDesc(TII->get(UnscaledOp));
5756 
5757     MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
5758     return !Offset;
5759   }
5760 
5761   return false;
5762 }
5763 
5764 void AArch64InstrInfo::insertNoop(MachineBasicBlock &MBB,
5765                                   MachineBasicBlock::iterator MI) const {
5766   DebugLoc DL;
5767   BuildMI(MBB, MI, DL, get(AArch64::HINT)).addImm(0);
5768 }
5769 
5770 MCInst AArch64InstrInfo::getNop() const {
5771   return MCInstBuilder(AArch64::HINT).addImm(0);
5772 }
5773 
5774 // AArch64 supports MachineCombiner.
5775 bool AArch64InstrInfo::useMachineCombiner() const { return true; }
5776 
5777 // True when Opc sets the NZCV flags.
5778 static bool isCombineInstrSettingFlag(unsigned Opc) {
5779   switch (Opc) {
5780   case AArch64::ADDSWrr:
5781   case AArch64::ADDSWri:
5782   case AArch64::ADDSXrr:
5783   case AArch64::ADDSXri:
5784   case AArch64::SUBSWrr:
5785   case AArch64::SUBSXrr:
5786   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5787   case AArch64::SUBSWri:
5788   case AArch64::SUBSXri:
5789     return true;
5790   default:
5791     break;
5792   }
5793   return false;
5794 }
5795 
5796 // 32b Opcodes that can be combined with a MUL
5797 static bool isCombineInstrCandidate32(unsigned Opc) {
5798   switch (Opc) {
5799   case AArch64::ADDWrr:
5800   case AArch64::ADDWri:
5801   case AArch64::SUBWrr:
5802   case AArch64::ADDSWrr:
5803   case AArch64::ADDSWri:
5804   case AArch64::SUBSWrr:
5805   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5806   case AArch64::SUBWri:
5807   case AArch64::SUBSWri:
5808     return true;
5809   default:
5810     break;
5811   }
5812   return false;
5813 }
5814 
5815 // 64b Opcodes that can be combined with a MUL
5816 static bool isCombineInstrCandidate64(unsigned Opc) {
5817   switch (Opc) {
5818   case AArch64::ADDXrr:
5819   case AArch64::ADDXri:
5820   case AArch64::SUBXrr:
5821   case AArch64::ADDSXrr:
5822   case AArch64::ADDSXri:
5823   case AArch64::SUBSXrr:
5824   // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5825   case AArch64::SUBXri:
5826   case AArch64::SUBSXri:
5827   case AArch64::ADDv8i8:
5828   case AArch64::ADDv16i8:
5829   case AArch64::ADDv4i16:
5830   case AArch64::ADDv8i16:
5831   case AArch64::ADDv2i32:
5832   case AArch64::ADDv4i32:
5833   case AArch64::SUBv8i8:
5834   case AArch64::SUBv16i8:
5835   case AArch64::SUBv4i16:
5836   case AArch64::SUBv8i16:
5837   case AArch64::SUBv2i32:
5838   case AArch64::SUBv4i32:
5839     return true;
5840   default:
5841     break;
5842   }
5843   return false;
5844 }
5845 
5846 // FP Opcodes that can be combined with a FMUL.
5847 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
5848   switch (Inst.getOpcode()) {
5849   default:
5850     break;
5851   case AArch64::FADDHrr:
5852   case AArch64::FADDSrr:
5853   case AArch64::FADDDrr:
5854   case AArch64::FADDv4f16:
5855   case AArch64::FADDv8f16:
5856   case AArch64::FADDv2f32:
5857   case AArch64::FADDv2f64:
5858   case AArch64::FADDv4f32:
5859   case AArch64::FSUBHrr:
5860   case AArch64::FSUBSrr:
5861   case AArch64::FSUBDrr:
5862   case AArch64::FSUBv4f16:
5863   case AArch64::FSUBv8f16:
5864   case AArch64::FSUBv2f32:
5865   case AArch64::FSUBv2f64:
5866   case AArch64::FSUBv4f32:
5867     TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
5868     // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
5869     // the target options or if FADD/FSUB has the contract fast-math flag.
5870     return Options.UnsafeFPMath ||
5871            Options.AllowFPOpFusion == FPOpFusion::Fast ||
5872            Inst.getFlag(MachineInstr::FmContract);
5874   }
5875   return false;
5876 }
5877 
5878 // Opcodes that can be combined with a MUL
5879 static bool isCombineInstrCandidate(unsigned Opc) {
5880   return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
5881 }
5882 
5883 //
5884 // Utility routine that checks if \param MO is defined by an
5885 // \param CombineOpc instruction in the basic block \param MBB
5886 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
5887                        unsigned CombineOpc, unsigned ZeroReg = 0,
5888                        bool CheckZeroReg = false) {
5889   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
5890   MachineInstr *MI = nullptr;
5891 
5892   if (MO.isReg() && MO.getReg().isVirtual())
5893     MI = MRI.getUniqueVRegDef(MO.getReg());
5894   // And it needs to be in the trace (otherwise, it won't have a depth).
5895   if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
5896     return false;
5897   // The result must only be used by the instruction we combine with.
5898   if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
5899     return false;
5900 
5901   if (CheckZeroReg) {
5902     assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
5903            MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
5904            MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
5905     // The third input reg must be zero.
5906     if (MI->getOperand(3).getReg() != ZeroReg)
5907       return false;
5908   }
5909 
5910   if (isCombineInstrSettingFlag(CombineOpc) &&
5911       MI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
5912     return false;
5913 
5914   return true;
5915 }
5916 
5917 //
5918 // Is \param MO defined by an integer multiply that can be combined?
5919 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
5920                               unsigned MulOpc, unsigned ZeroReg) {
5921   return canCombine(MBB, MO, MulOpc, ZeroReg, true);
5922 }
5923 
5924 //
5925 // Is \param MO defined by a floating-point multiply that can be combined?
5926 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
5927                                unsigned MulOpc) {
5928   return canCombine(MBB, MO, MulOpc);
5929 }
5930 
5931 // TODO: There are many more machine instruction opcodes to match:
5932 //       1. Other data types (integer, vectors)
5933 //       2. Other math / logic operations (xor, or)
5934 //       3. Other forms of the same operation (intrinsics and other variants)
5935 bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
5936                                                    bool Invert) const {
5937   if (Invert)
5938     return false;
5939   switch (Inst.getOpcode()) {
5940   // == Floating-point types ==
5941   // -- Floating-point instructions --
5942   case AArch64::FADDHrr:
5943   case AArch64::FADDSrr:
5944   case AArch64::FADDDrr:
5945   case AArch64::FMULHrr:
5946   case AArch64::FMULSrr:
5947   case AArch64::FMULDrr:
5948   case AArch64::FMULX16:
5949   case AArch64::FMULX32:
5950   case AArch64::FMULX64:
5951   // -- Advanced SIMD instructions --
5952   case AArch64::FADDv4f16:
5953   case AArch64::FADDv8f16:
5954   case AArch64::FADDv2f32:
5955   case AArch64::FADDv4f32:
5956   case AArch64::FADDv2f64:
5957   case AArch64::FMULv4f16:
5958   case AArch64::FMULv8f16:
5959   case AArch64::FMULv2f32:
5960   case AArch64::FMULv4f32:
5961   case AArch64::FMULv2f64:
5962   case AArch64::FMULXv4f16:
5963   case AArch64::FMULXv8f16:
5964   case AArch64::FMULXv2f32:
5965   case AArch64::FMULXv4f32:
5966   case AArch64::FMULXv2f64:
5967   // -- SVE instructions --
5968   // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
5969   // in the SVE instruction set (though there are predicated ones).
5970   case AArch64::FADD_ZZZ_H:
5971   case AArch64::FADD_ZZZ_S:
5972   case AArch64::FADD_ZZZ_D:
5973   case AArch64::FMUL_ZZZ_H:
5974   case AArch64::FMUL_ZZZ_S:
5975   case AArch64::FMUL_ZZZ_D:
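         // Reassociation of FP operations is only allowed with global unsafe FP
         // math, or when the instruction carries both the reassoc and nsz flags.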
5976     return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath ||
5977            (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
5978             Inst.getFlag(MachineInstr::MIFlag::FmNsz));
5979 
5980   // == Integer types ==
5981   // -- Base instructions --
5982   // Opcodes MULWrr and MULXrr don't exist because
5983   // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
5984   // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
5985   // The machine-combiner does not support three-source-operand machine
5986   // instructions, so we cannot reassociate MULs.
5987   case AArch64::ADDWrr:
5988   case AArch64::ADDXrr:
5989   case AArch64::ANDWrr:
5990   case AArch64::ANDXrr:
5991   case AArch64::ORRWrr:
5992   case AArch64::ORRXrr:
5993   case AArch64::EORWrr:
5994   case AArch64::EORXrr:
5995   case AArch64::EONWrr:
5996   case AArch64::EONXrr:
5997   // -- Advanced SIMD instructions --
5998   // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
5999   // in the Advanced SIMD instruction set.
6000   case AArch64::ADDv8i8:
6001   case AArch64::ADDv16i8:
6002   case AArch64::ADDv4i16:
6003   case AArch64::ADDv8i16:
6004   case AArch64::ADDv2i32:
6005   case AArch64::ADDv4i32:
6006   case AArch64::ADDv1i64:
6007   case AArch64::ADDv2i64:
6008   case AArch64::MULv8i8:
6009   case AArch64::MULv16i8:
6010   case AArch64::MULv4i16:
6011   case AArch64::MULv8i16:
6012   case AArch64::MULv2i32:
6013   case AArch64::MULv4i32:
6014   case AArch64::ANDv8i8:
6015   case AArch64::ANDv16i8:
6016   case AArch64::ORRv8i8:
6017   case AArch64::ORRv16i8:
6018   case AArch64::EORv8i8:
6019   case AArch64::EORv16i8:
6020   // -- SVE instructions --
6021   case AArch64::ADD_ZZZ_B:
6022   case AArch64::ADD_ZZZ_H:
6023   case AArch64::ADD_ZZZ_S:
6024   case AArch64::ADD_ZZZ_D:
6025   case AArch64::MUL_ZZZ_B:
6026   case AArch64::MUL_ZZZ_H:
6027   case AArch64::MUL_ZZZ_S:
6028   case AArch64::MUL_ZZZ_D:
6029   case AArch64::AND_ZZZ:
6030   case AArch64::ORR_ZZZ:
6031   case AArch64::EOR_ZZZ:
6032     return true;
6033 
6034   default:
6035     return false;
6036   }
6037 }
6038 
6039 /// Find instructions that can be turned into madd.
6040 static bool getMaddPatterns(MachineInstr &Root,
6041                             SmallVectorImpl<MachineCombinerPattern> &Patterns) {
6042   unsigned Opc = Root.getOpcode();
6043   MachineBasicBlock &MBB = *Root.getParent();
6044   bool Found = false;
6045 
6046   if (!isCombineInstrCandidate(Opc))
6047     return false;
6048   if (isCombineInstrSettingFlag(Opc)) {
6049     int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
6050     // When NZCV is live, bail out.
6051     if (Cmp_NZCV == -1)
6052       return false;
6053     unsigned NewOpc = convertToNonFlagSettingOpc(Root);
6054     // When the opcode can't change, bail out.
6055     // CHECKME: do we miss any cases for opcode conversion?
6056     if (NewOpc == Opc)
6057       return false;
6058     Opc = NewOpc;
6059   }
6060 
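       // Record \p Pattern if operand \p Operand of the root is defined by a
       // single-use \p Opcode (MADD/MSUB) whose addend is \p ZeroReg, i.e. by a
       // plain MUL.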
6061   auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
6062                       MachineCombinerPattern Pattern) {
6063     if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
6064       Patterns.push_back(Pattern);
6065       Found = true;
6066     }
6067   };
6068 
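       // Vector variant: no zero-addend check, the defining instruction is a
       // genuine vector MUL.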
6069   auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) {
6070     if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
6071       Patterns.push_back(Pattern);
6072       Found = true;
6073     }
6074   };
6075 
6076   typedef MachineCombinerPattern MCP;
6077 
6078   switch (Opc) {
6079   default:
6080     break;
6081   case AArch64::ADDWrr:
6082     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6083            "ADDWrr does not have register operands");
6084     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
6085     setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
6086     break;
6087   case AArch64::ADDXrr:
6088     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
6089     setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
6090     break;
6091   case AArch64::SUBWrr:
6092     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
6093     setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
6094     break;
6095   case AArch64::SUBXrr:
6096     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
6097     setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
6098     break;
6099   case AArch64::ADDWri:
6100     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
6101     break;
6102   case AArch64::ADDXri:
6103     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
6104     break;
6105   case AArch64::SUBWri:
6106     setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
6107     break;
6108   case AArch64::SUBXri:
6109     setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
6110     break;
6111   case AArch64::ADDv8i8:
6112     setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
6113     setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
6114     break;
6115   case AArch64::ADDv16i8:
6116     setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
6117     setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
6118     break;
6119   case AArch64::ADDv4i16:
6120     setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
6121     setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
6122     setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
6123     setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
6124     break;
6125   case AArch64::ADDv8i16:
6126     setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
6127     setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
6128     setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
6129     setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
6130     break;
6131   case AArch64::ADDv2i32:
6132     setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
6133     setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
6134     setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
6135     setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
6136     break;
6137   case AArch64::ADDv4i32:
6138     setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
6139     setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
6140     setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
6141     setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
6142     break;
6143   case AArch64::SUBv8i8:
6144     setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
6145     setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
6146     break;
6147   case AArch64::SUBv16i8:
6148     setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
6149     setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
6150     break;
6151   case AArch64::SUBv4i16:
6152     setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
6153     setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
6154     setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
6155     setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
6156     break;
6157   case AArch64::SUBv8i16:
6158     setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
6159     setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
6160     setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
6161     setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
6162     break;
6163   case AArch64::SUBv2i32:
6164     setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
6165     setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
6166     setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
6167     setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
6168     break;
6169   case AArch64::SUBv4i32:
6170     setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
6171     setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
6172     setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
6173     setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
6174     break;
6175   }
6176   return Found;
6177 }
6178 /// Floating-Point Support
6179 
6180 /// Find instructions that can be turned into a fused multiply-add (fmadd).
6181 static bool getFMAPatterns(MachineInstr &Root,
6182                            SmallVectorImpl<MachineCombinerPattern> &Patterns) {
6183 
6184   if (!isCombineInstrCandidateFP(Root))
6185     return false;
6186 
6187   MachineBasicBlock &MBB = *Root.getParent();
6188   bool Found = false;
6189 
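       // Record \p Pattern if operand \p Operand of the root is defined by a
       // single-use \p Opcode multiply in this block.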
6190   auto Match = [&](int Opcode, int Operand,
6191                    MachineCombinerPattern Pattern) -> bool {
6192     if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
6193       Patterns.push_back(Pattern);
6194       return true;
6195     }
6196     return false;
6197   };
6198 
6199   typedef MachineCombinerPattern MCP;
6200 
6201   switch (Root.getOpcode()) {
6202   default:
6203     assert(false && "Unsupported FP instruction in combiner\n");
6204     break;
6205   case AArch64::FADDHrr:
6206     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6207            "FADDHrr does not have register operands");
6208 
6209     Found  = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
6210     Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
6211     break;
6212   case AArch64::FADDSrr:
6213     assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6214            "FADDSrr does not have register operands");
6215 
6216     Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
6217              Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
6218 
6219     Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
6220              Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
6221     break;
6222   case AArch64::FADDDrr:
6223     Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
6224              Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
6225 
6226     Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
6227              Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
6228     break;
6229   case AArch64::FADDv4f16:
6230     Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
6231              Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
6232 
6233     Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
6234              Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
6235     break;
6236   case AArch64::FADDv8f16:
6237     Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
6238              Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
6239 
6240     Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
6241              Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
6242     break;
6243   case AArch64::FADDv2f32:
6244     Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
6245              Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
6246 
6247     Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
6248              Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
6249     break;
6250   case AArch64::FADDv2f64:
6251     Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
6252              Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
6253 
6254     Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
6255              Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
6256     break;
6257   case AArch64::FADDv4f32:
6258     Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
6259              Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
6260 
6261     Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
6262              Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
6263     break;
6264   case AArch64::FSUBHrr:
6265     Found  = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
6266     Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
6267     Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
6268     break;
6269   case AArch64::FSUBSrr:
6270     Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
6271 
6272     Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
6273              Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
6274 
6275     Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
6276     break;
6277   case AArch64::FSUBDrr:
6278     Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
6279 
6280     Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
6281              Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
6282 
6283     Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
6284     break;
6285   case AArch64::FSUBv4f16:
6286     Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
6287              Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
6288 
6289     Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
6290              Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
6291     break;
6292   case AArch64::FSUBv8f16:
6293     Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
6294              Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
6295 
6296     Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
6297              Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
6298     break;
6299   case AArch64::FSUBv2f32:
6300     Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
6301              Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
6302 
6303     Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
6304              Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
6305     break;
6306   case AArch64::FSUBv2f64:
6307     Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
6308              Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
6309 
6310     Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
6311              Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
6312     break;
6313   case AArch64::FSUBv4f32:
6314     Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
6315              Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
6316 
6317     Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
6318              Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
6319     break;
6320   }
6321   return Found;
6322 }
6323 
6324 static bool getFMULPatterns(MachineInstr &Root,
6325                             SmallVectorImpl<MachineCombinerPattern> &Patterns) {
6326   MachineBasicBlock &MBB = *Root.getParent();
6327   bool Found = false;
6328 
6329   auto Match = [&](unsigned Opcode, int Operand,
6330                    MachineCombinerPattern Pattern) -> bool {
6331     MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6332     MachineOperand &MO = Root.getOperand(Operand);
6333     MachineInstr *MI = nullptr;
6334     if (MO.isReg() && MO.getReg().isVirtual())
6335       MI = MRI.getUniqueVRegDef(MO.getReg());
6336     // Ignore no-op COPYs in FMUL(COPY(DUP(..)))
6337     if (MI && MI->getOpcode() == TargetOpcode::COPY &&
6338         MI->getOperand(1).getReg().isVirtual())
6339       MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());
6340     if (MI && MI->getOpcode() == Opcode) {
6341       Patterns.push_back(Pattern);
6342       return true;
6343     }
6344     return false;
6345   };
6346 
6347   typedef MachineCombinerPattern MCP;
6348 
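       // Match FMULs whose operand is a lane duplicate (DUP); these can be folded
       // into the indexed (by-element) form of FMUL.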
6349   switch (Root.getOpcode()) {
6350   default:
6351     return false;
6352   case AArch64::FMULv2f32:
6353     Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
6354     Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
6355     break;
6356   case AArch64::FMULv2f64:
6357     Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
6358     Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
6359     break;
6360   case AArch64::FMULv4f16:
6361     Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
6362     Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
6363     break;
6364   case AArch64::FMULv4f32:
6365     Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
6366     Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
6367     break;
6368   case AArch64::FMULv8f16:
6369     Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
6370     Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
6371     break;
6372   }
6373 
6374   return Found;
6375 }
6376 
6377 static bool getFNEGPatterns(MachineInstr &Root,
6378                             SmallVectorImpl<MachineCombinerPattern> &Patterns) {
6379   unsigned Opc = Root.getOpcode();
6380   MachineBasicBlock &MBB = *Root.getParent();
6381   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6382 
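       // Match a single-use FMADD feeding this FNEG; both instructions must carry
       // the contract and nsz fast-math flags for the combine to fire.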
6383   auto Match = [&](unsigned Opcode, MachineCombinerPattern Pattern) -> bool {
6384     MachineOperand &MO = Root.getOperand(1);
6385     MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg());
6386     if (MI != nullptr && (MI->getOpcode() == Opcode) &&
6387         MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&
6388         Root.getFlag(MachineInstr::MIFlag::FmContract) &&
6389         Root.getFlag(MachineInstr::MIFlag::FmNsz) &&
6390         MI->getFlag(MachineInstr::MIFlag::FmContract) &&
6391         MI->getFlag(MachineInstr::MIFlag::FmNsz)) {
6392       Patterns.push_back(Pattern);
6393       return true;
6394     }
6395     return false;
6396   };
6397 
6398   switch (Opc) {
6399   default:
6400     break;
6401   case AArch64::FNEGDr:
6402     return Match(AArch64::FMADDDrrr, MachineCombinerPattern::FNMADD);
6403   case AArch64::FNEGSr:
6404     return Match(AArch64::FMADDSrrr, MachineCombinerPattern::FNMADD);
6405   }
6406 
6407   return false;
6408 }
6409 
6410 /// Return true when a code sequence can improve throughput. It
6411 /// should be called only for instructions in loops.
6412 /// \param Pattern - combiner pattern
6413 bool AArch64InstrInfo::isThroughputPattern(
6414     MachineCombinerPattern Pattern) const {
6415   switch (Pattern) {
6416   default:
6417     break;
6418   case MachineCombinerPattern::FMULADDH_OP1:
6419   case MachineCombinerPattern::FMULADDH_OP2:
6420   case MachineCombinerPattern::FMULSUBH_OP1:
6421   case MachineCombinerPattern::FMULSUBH_OP2:
6422   case MachineCombinerPattern::FMULADDS_OP1:
6423   case MachineCombinerPattern::FMULADDS_OP2:
6424   case MachineCombinerPattern::FMULSUBS_OP1:
6425   case MachineCombinerPattern::FMULSUBS_OP2:
6426   case MachineCombinerPattern::FMULADDD_OP1:
6427   case MachineCombinerPattern::FMULADDD_OP2:
6428   case MachineCombinerPattern::FMULSUBD_OP1:
6429   case MachineCombinerPattern::FMULSUBD_OP2:
6430   case MachineCombinerPattern::FNMULSUBH_OP1:
6431   case MachineCombinerPattern::FNMULSUBS_OP1:
6432   case MachineCombinerPattern::FNMULSUBD_OP1:
6433   case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
6434   case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
6435   case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
6436   case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
6437   case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
6438   case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
6439   case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
6440   case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
6441   case MachineCombinerPattern::FMLAv4f16_OP2:
6442   case MachineCombinerPattern::FMLAv4f16_OP1:
6443   case MachineCombinerPattern::FMLAv8f16_OP1:
6444   case MachineCombinerPattern::FMLAv8f16_OP2:
6445   case MachineCombinerPattern::FMLAv2f32_OP2:
6446   case MachineCombinerPattern::FMLAv2f32_OP1:
6447   case MachineCombinerPattern::FMLAv2f64_OP1:
6448   case MachineCombinerPattern::FMLAv2f64_OP2:
6449   case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
6450   case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
6451   case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
6452   case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
6453   case MachineCombinerPattern::FMLAv4f32_OP1:
6454   case MachineCombinerPattern::FMLAv4f32_OP2:
6455   case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
6456   case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
6457   case MachineCombinerPattern::FMLSv4i16_indexed_OP1:
6458   case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
6459   case MachineCombinerPattern::FMLSv8i16_indexed_OP1:
6460   case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
6461   case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
6462   case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
6463   case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
6464   case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
6465   case MachineCombinerPattern::FMLSv4f16_OP1:
6466   case MachineCombinerPattern::FMLSv4f16_OP2:
6467   case MachineCombinerPattern::FMLSv8f16_OP1:
6468   case MachineCombinerPattern::FMLSv8f16_OP2:
6469   case MachineCombinerPattern::FMLSv2f32_OP2:
6470   case MachineCombinerPattern::FMLSv2f64_OP2:
6471   case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
6472   case MachineCombinerPattern::FMLSv4f32_OP2:
6473   case MachineCombinerPattern::FMULv2i32_indexed_OP1:
6474   case MachineCombinerPattern::FMULv2i32_indexed_OP2:
6475   case MachineCombinerPattern::FMULv2i64_indexed_OP1:
6476   case MachineCombinerPattern::FMULv2i64_indexed_OP2:
6477   case MachineCombinerPattern::FMULv4i16_indexed_OP1:
6478   case MachineCombinerPattern::FMULv4i16_indexed_OP2:
6479   case MachineCombinerPattern::FMULv4i32_indexed_OP1:
6480   case MachineCombinerPattern::FMULv4i32_indexed_OP2:
6481   case MachineCombinerPattern::FMULv8i16_indexed_OP1:
6482   case MachineCombinerPattern::FMULv8i16_indexed_OP2:
6483   case MachineCombinerPattern::MULADDv8i8_OP1:
6484   case MachineCombinerPattern::MULADDv8i8_OP2:
6485   case MachineCombinerPattern::MULADDv16i8_OP1:
6486   case MachineCombinerPattern::MULADDv16i8_OP2:
6487   case MachineCombinerPattern::MULADDv4i16_OP1:
6488   case MachineCombinerPattern::MULADDv4i16_OP2:
6489   case MachineCombinerPattern::MULADDv8i16_OP1:
6490   case MachineCombinerPattern::MULADDv8i16_OP2:
6491   case MachineCombinerPattern::MULADDv2i32_OP1:
6492   case MachineCombinerPattern::MULADDv2i32_OP2:
6493   case MachineCombinerPattern::MULADDv4i32_OP1:
6494   case MachineCombinerPattern::MULADDv4i32_OP2:
6495   case MachineCombinerPattern::MULSUBv8i8_OP1:
6496   case MachineCombinerPattern::MULSUBv8i8_OP2:
6497   case MachineCombinerPattern::MULSUBv16i8_OP1:
6498   case MachineCombinerPattern::MULSUBv16i8_OP2:
6499   case MachineCombinerPattern::MULSUBv4i16_OP1:
6500   case MachineCombinerPattern::MULSUBv4i16_OP2:
6501   case MachineCombinerPattern::MULSUBv8i16_OP1:
6502   case MachineCombinerPattern::MULSUBv8i16_OP2:
6503   case MachineCombinerPattern::MULSUBv2i32_OP1:
6504   case MachineCombinerPattern::MULSUBv2i32_OP2:
6505   case MachineCombinerPattern::MULSUBv4i32_OP1:
6506   case MachineCombinerPattern::MULSUBv4i32_OP2:
6507   case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
6508   case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
6509   case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
6510   case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
6511   case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
6512   case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
6513   case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
6514   case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
6515   case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
6516   case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
6517   case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
6518   case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
6519   case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
6520   case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
6521   case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
6522   case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
6523     return true;
6524   } // end switch (Pattern)
6525   return false;
6526 }
6527 
6528 /// Find other MI combine patterns.
6529 static bool getMiscPatterns(MachineInstr &Root,
6530                             SmallVectorImpl<MachineCombinerPattern> &Patterns)
6531 {
6532   // A - (B + C)  ==>   (A - B) - C  or  (A - C) - B
6533   unsigned Opc = Root.getOpcode();
6534   MachineBasicBlock &MBB = *Root.getParent();
6535 
6536   switch (Opc) {
6537   case AArch64::SUBWrr:
6538   case AArch64::SUBSWrr:
6539   case AArch64::SUBXrr:
6540   case AArch64::SUBSXrr:
6541     // Found candidate root.
6542     break;
6543   default:
6544     return false;
6545   }
6546 
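       // For flag-setting SUBS roots, only rewrite when the NZCV def is dead,
       // since the replacement sequence uses non-flag-setting SUBs.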
6547   if (isCombineInstrSettingFlag(Opc) &&
6548       Root.findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
6549     return false;
6550 
6551   if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
6552       canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
6553       canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
6554       canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
6555     Patterns.push_back(MachineCombinerPattern::SUBADD_OP1);
6556     Patterns.push_back(MachineCombinerPattern::SUBADD_OP2);
6557     return true;
6558   }
6559 
6560   return false;
6561 }
6562 
6563 /// Return true when there is potentially a faster code sequence for an
6564 /// instruction chain ending in \p Root. All potential patterns are listed in
6565 /// the \p Patterns vector. Patterns should be sorted in priority order since the
6566 /// pattern evaluator stops checking as soon as it finds a faster sequence.
6567 
6568 bool AArch64InstrInfo::getMachineCombinerPatterns(
6569     MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
6570     bool DoRegPressureReduce) const {
6571   // Integer patterns
6572   if (getMaddPatterns(Root, Patterns))
6573     return true;
6574   // Floating point patterns
6575   if (getFMULPatterns(Root, Patterns))
6576     return true;
6577   if (getFMAPatterns(Root, Patterns))
6578     return true;
6579   if (getFNEGPatterns(Root, Patterns))
6580     return true;
6581 
6582   // Other patterns
6583   if (getMiscPatterns(Root, Patterns))
6584     return true;
6585 
6586   return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
6587                                                      DoRegPressureReduce);
6588 }
6589 
6590 enum class FMAInstKind { Default, Indexed, Accumulator };
6591 /// genFusedMultiply - Generate fused multiply instructions.
6592 /// This function supports both integer and floating point instructions.
6593 /// A typical example:
6594 ///  F|MUL I=A,B,0
6595 ///  F|ADD R,I,C
6596 ///  ==> F|MADD R,A,B,C
6597 /// \param MF Containing MachineFunction
6598 /// \param MRI Register information
6599 /// \param TII Target information
6600 /// \param Root is the F|ADD instruction
6601 /// \param [out] InsInstrs is a vector of machine instructions and will
6602 /// contain the generated madd instruction
6603 /// \param IdxMulOpd is index of operand in Root that is the result of
6604 /// the F|MUL. In the example above IdxMulOpd is 1.
6605 /// \param MaddOpc the opcode of the f|madd instruction
6606 /// \param RC Register class of operands
6607 /// \param kind the kind of FMA instruction (addressing mode) to be generated
6608 /// \param ReplacedAddend is the result register from the instruction
6609 /// replacing the non-combined operand, if any.
6610 static MachineInstr *
6611 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
6612                  const TargetInstrInfo *TII, MachineInstr &Root,
6613                  SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
6614                  unsigned MaddOpc, const TargetRegisterClass *RC,
6615                  FMAInstKind kind = FMAInstKind::Default,
6616                  const Register *ReplacedAddend = nullptr) {
6617   assert(IdxMulOpd == 1 || IdxMulOpd == 2);
6618 
6619   unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
6620   MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
6621   Register ResultReg = Root.getOperand(0).getReg();
6622   Register SrcReg0 = MUL->getOperand(1).getReg();
6623   bool Src0IsKill = MUL->getOperand(1).isKill();
6624   Register SrcReg1 = MUL->getOperand(2).getReg();
6625   bool Src1IsKill = MUL->getOperand(2).isKill();
6626 
6627   Register SrcReg2;
6628   bool Src2IsKill;
6629   if (ReplacedAddend) {
6630     // If we just generated a new addend, we must be its only use.
6631     SrcReg2 = *ReplacedAddend;
6632     Src2IsKill = true;
6633   } else {
6634     SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
6635     Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
6636   }
6637 
6638   if (ResultReg.isVirtual())
6639     MRI.constrainRegClass(ResultReg, RC);
6640   if (SrcReg0.isVirtual())
6641     MRI.constrainRegClass(SrcReg0, RC);
6642   if (SrcReg1.isVirtual())
6643     MRI.constrainRegClass(SrcReg1, RC);
6644   if (SrcReg2.isVirtual())
6645     MRI.constrainRegClass(SrcReg2, RC);
6646 
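       // MADD-style opcodes take the multiply operands first and the addend last;
       // accumulator-style opcodes (MLA/FMLA etc.) take the accumulator first, and
       // the indexed forms additionally copy the lane immediate from the multiply.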
6647   MachineInstrBuilder MIB;
6648   if (kind == FMAInstKind::Default)
6649     MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
6650               .addReg(SrcReg0, getKillRegState(Src0IsKill))
6651               .addReg(SrcReg1, getKillRegState(Src1IsKill))
6652               .addReg(SrcReg2, getKillRegState(Src2IsKill));
6653   else if (kind == FMAInstKind::Indexed)
6654     MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
6655               .addReg(SrcReg2, getKillRegState(Src2IsKill))
6656               .addReg(SrcReg0, getKillRegState(Src0IsKill))
6657               .addReg(SrcReg1, getKillRegState(Src1IsKill))
6658               .addImm(MUL->getOperand(3).getImm());
6659   else if (kind == FMAInstKind::Accumulator)
6660     MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
6661               .addReg(SrcReg2, getKillRegState(Src2IsKill))
6662               .addReg(SrcReg0, getKillRegState(Src0IsKill))
6663               .addReg(SrcReg1, getKillRegState(Src1IsKill));
6664   else
6665     assert(false && "Invalid FMA instruction kind \n");
6666   // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
6667   InsInstrs.push_back(MIB);
6668   return MUL;
6669 }
6670 
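     /// genFNegatedMAD - Fold FNEG(FMADD A, B, C) into FNMADD A, B, C, which
     /// computes -(A * B + C).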
6671 static MachineInstr *
6672 genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI,
6673                const TargetInstrInfo *TII, MachineInstr &Root,
6674                SmallVectorImpl<MachineInstr *> &InsInstrs) {
6675   MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
6676 
6677   unsigned Opc = 0;
6678   const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());
6679   if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6680     Opc = AArch64::FNMADDSrrr;
6681   else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
6682     Opc = AArch64::FNMADDDrrr;
6683   else
6684     return nullptr;
6685 
6686   Register ResultReg = Root.getOperand(0).getReg();
6687   Register SrcReg0 = MAD->getOperand(1).getReg();
6688   Register SrcReg1 = MAD->getOperand(2).getReg();
6689   Register SrcReg2 = MAD->getOperand(3).getReg();
6690   bool Src0IsKill = MAD->getOperand(1).isKill();
6691   bool Src1IsKill = MAD->getOperand(2).isKill();
6692   bool Src2IsKill = MAD->getOperand(3).isKill();
6693   if (ResultReg.isVirtual())
6694     MRI.constrainRegClass(ResultReg, RC);
6695   if (SrcReg0.isVirtual())
6696     MRI.constrainRegClass(SrcReg0, RC);
6697   if (SrcReg1.isVirtual())
6698     MRI.constrainRegClass(SrcReg1, RC);
6699   if (SrcReg2.isVirtual())
6700     MRI.constrainRegClass(SrcReg2, RC);
6701 
6702   MachineInstrBuilder MIB =
6703       BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)
6704           .addReg(SrcReg0, getKillRegState(Src0IsKill))
6705           .addReg(SrcReg1, getKillRegState(Src1IsKill))
6706           .addReg(SrcReg2, getKillRegState(Src2IsKill));
6707   InsInstrs.push_back(MIB);
6708 
6709   return MAD;
6710 }
6711 
6712 /// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
6713 static MachineInstr *
6714 genIndexedMultiply(MachineInstr &Root,
6715                    SmallVectorImpl<MachineInstr *> &InsInstrs,
6716                    unsigned IdxDupOp, unsigned MulOpc,
6717                    const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
6718   assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
6719          "Invalid index of FMUL operand");
6720 
6721   MachineFunction &MF = *Root.getMF();
6722   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
6723 
6724   MachineInstr *Dup =
6725       MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());
6726 
6727   if (Dup->getOpcode() == TargetOpcode::COPY)
6728     Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg());
6729 
6730   Register DupSrcReg = Dup->getOperand(1).getReg();
6731   MRI.clearKillFlags(DupSrcReg);
6732   MRI.constrainRegClass(DupSrcReg, RC);
6733 
6734   unsigned DupSrcLane = Dup->getOperand(2).getImm();
6735 
6736   unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
6737   MachineOperand &MulOp = Root.getOperand(IdxMulOp);
6738 
6739   Register ResultReg = Root.getOperand(0).getReg();
6740 
6741   MachineInstrBuilder MIB;
6742   MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg)
6743             .add(MulOp)
6744             .addReg(DupSrcReg)
6745             .addImm(DupSrcLane);
6746 
6747   InsInstrs.push_back(MIB);
6748   return &Root;
6749 }
6750 
6751 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
6752 /// instructions.
6753 ///
6754 /// \see genFusedMultiply
6755 static MachineInstr *genFusedMultiplyAcc(
6756     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6757     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6758     unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
6759   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6760                           FMAInstKind::Accumulator);
6761 }
6762 
6763 /// genNeg - Helper to generate an intermediate negation of the second operand
6764 /// of Root
6765 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
6766                        const TargetInstrInfo *TII, MachineInstr &Root,
6767                        SmallVectorImpl<MachineInstr *> &InsInstrs,
6768                        DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
6769                        unsigned MnegOpc, const TargetRegisterClass *RC) {
6770   Register NewVR = MRI.createVirtualRegister(RC);
6771   MachineInstrBuilder MIB =
6772       BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR)
6773           .add(Root.getOperand(2));
6774   InsInstrs.push_back(MIB);
6775 
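       // NewVR is produced by the instruction just added at index 0 of InsInstrs;
       // record that so the combiner can compute its depth.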
6776   assert(InstrIdxForVirtReg.empty());
6777   InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
6778 
6779   return NewVR;
6780 }
6781 
6782 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
6783 /// instructions with an additional negation of the accumulator
6784 static MachineInstr *genFusedMultiplyAccNeg(
6785     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6786     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6787     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
6788     unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
6789   assert(IdxMulOpd == 1);
6790 
6791   Register NewVR =
6792       genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
6793   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6794                           FMAInstKind::Accumulator, &NewVR);
6795 }
6796 
6797 /// genFusedMultiplyIdx - Helper to generate indexed (by-element) fused
6798 /// multiply accumulate instructions.
6799 ///
6800 /// \see genFusedMultiply
6801 static MachineInstr *genFusedMultiplyIdx(
6802     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6803     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6804     unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
6805   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6806                           FMAInstKind::Indexed);
6807 }
6808 
6809 /// genFusedMultiplyIdxNeg - Helper to generate indexed fused multiply
6810 /// accumulate instructions with an additional negation of the accumulator
6811 static MachineInstr *genFusedMultiplyIdxNeg(
6812     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6813     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6814     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
6815     unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
6816   assert(IdxMulOpd == 1);
6817 
6818   Register NewVR =
6819       genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
6820 
6821   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6822                           FMAInstKind::Indexed, &NewVR);
6823 }
6824 
6825 /// genMaddR - Generate madd instruction and combine mul and add using
6826 /// an extra virtual register
6827 /// Example - an ADD intermediate needs to be stored in a register:
6828 ///   MUL I=A,B,0
6829 ///   ADD R,I,Imm
6830 ///   ==> ORR  V, ZR, Imm
6831 ///   ==> MADD R,A,B,V
6832 /// \param MF Containing MachineFunction
6833 /// \param MRI Register information
6834 /// \param TII Target information
6835 /// \param Root is the ADD instruction
6836 /// \param [out] InsInstrs is a vector of machine instructions and will
6837 /// contain the generated madd instruction
6838 /// \param IdxMulOpd is index of operand in Root that is the result of
6839 /// the MUL. In the example above IdxMulOpd is 1.
6840 /// \param MaddOpc the opcode of the madd instruction
6841 /// \param VR is a virtual register that holds the value of an ADD operand
6842 /// (V in the example above).
6843 /// \param RC Register class of operands
6844 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
6845                               const TargetInstrInfo *TII, MachineInstr &Root,
6846                               SmallVectorImpl<MachineInstr *> &InsInstrs,
6847                               unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
6848                               const TargetRegisterClass *RC) {
6849   assert(IdxMulOpd == 1 || IdxMulOpd == 2);
6850 
6851   MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
6852   Register ResultReg = Root.getOperand(0).getReg();
6853   Register SrcReg0 = MUL->getOperand(1).getReg();
6854   bool Src0IsKill = MUL->getOperand(1).isKill();
6855   Register SrcReg1 = MUL->getOperand(2).getReg();
6856   bool Src1IsKill = MUL->getOperand(2).isKill();
6857 
6858   if (ResultReg.isVirtual())
6859     MRI.constrainRegClass(ResultReg, RC);
6860   if (SrcReg0.isVirtual())
6861     MRI.constrainRegClass(SrcReg0, RC);
6862   if (SrcReg1.isVirtual())
6863     MRI.constrainRegClass(SrcReg1, RC);
6864   if (Register::isVirtualRegister(VR))
6865     MRI.constrainRegClass(VR, RC);
6866 
6867   MachineInstrBuilder MIB =
6868       BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
6869           .addReg(SrcReg0, getKillRegState(Src0IsKill))
6870           .addReg(SrcReg1, getKillRegState(Src1IsKill))
6871           .addReg(VR);
6872   // Insert the MADD
6873   InsInstrs.push_back(MIB);
6874   return MUL;
6875 }
6876 
6877 /// Do the following transformation
6878 /// A - (B + C)  ==>   (A - B) - C
6879 /// A - (B + C)  ==>   (A - C) - B
6880 static void
6881 genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
6882                  const TargetInstrInfo *TII, MachineInstr &Root,
6883                  SmallVectorImpl<MachineInstr *> &InsInstrs,
6884                  SmallVectorImpl<MachineInstr *> &DelInstrs,
6885                  unsigned IdxOpd1,
6886                  DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
6887   assert(IdxOpd1 == 1 || IdxOpd1 == 2);
6888   unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
6889   MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
6890 
6891   Register ResultReg = Root.getOperand(0).getReg();
6892   Register RegA = Root.getOperand(1).getReg();
6893   bool RegAIsKill = Root.getOperand(1).isKill();
6894   Register RegB = AddMI->getOperand(IdxOpd1).getReg();
6895   bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
6896   Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
6897   bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
6898   Register NewVR = MRI.createVirtualRegister(MRI.getRegClass(RegA));
6899 
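       // The pattern is only matched when NZCV is dead (see getMiscPatterns), so a
       // flag-setting SUBS root can safely be replaced with non-flag-setting SUBs.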
6900   unsigned Opcode = Root.getOpcode();
6901   if (Opcode == AArch64::SUBSWrr)
6902     Opcode = AArch64::SUBWrr;
6903   else if (Opcode == AArch64::SUBSXrr)
6904     Opcode = AArch64::SUBXrr;
6905   else
6906     assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
6907            "Unexpected instruction opcode.");
6908 
6909   MachineInstrBuilder MIB1 =
6910       BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)
6911           .addReg(RegA, getKillRegState(RegAIsKill))
6912           .addReg(RegB, getKillRegState(RegBIsKill));
6913   MachineInstrBuilder MIB2 =
6914       BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)
6915           .addReg(NewVR, getKillRegState(true))
6916           .addReg(RegC, getKillRegState(RegCIsKill));
6917 
6918   InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
6919   InsInstrs.push_back(MIB1);
6920   InsInstrs.push_back(MIB2);
6921   DelInstrs.push_back(AddMI);
6922 }
6923 
6924 /// When getMachineCombinerPatterns() finds potential patterns,
6925 /// this function generates the instructions that could replace the
6926 /// original code sequence
6927 void AArch64InstrInfo::genAlternativeCodeSequence(
6928     MachineInstr &Root, MachineCombinerPattern Pattern,
6929     SmallVectorImpl<MachineInstr *> &InsInstrs,
6930     SmallVectorImpl<MachineInstr *> &DelInstrs,
6931     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
6932   MachineBasicBlock &MBB = *Root.getParent();
6933   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6934   MachineFunction &MF = *MBB.getParent();
6935   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
6936 
6937   MachineInstr *MUL = nullptr;
6938   const TargetRegisterClass *RC;
6939   unsigned Opc;
6940   switch (Pattern) {
6941   default:
6942     // Reassociate instructions.
6943     TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
6944                                                 DelInstrs, InstrIdxForVirtReg);
6945     return;
6946   case MachineCombinerPattern::SUBADD_OP1:
6947     // A - (B + C)
6948     // ==> (A - B) - C
6949     genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
6950                      InstrIdxForVirtReg);
6951     break;
6952   case MachineCombinerPattern::SUBADD_OP2:
6953     // A - (B + C)
6954     // ==> (A - C) - B
6955     genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
6956                      InstrIdxForVirtReg);
6957     break;
6958   case MachineCombinerPattern::MULADDW_OP1:
6959   case MachineCombinerPattern::MULADDX_OP1:
6960     // MUL I=A,B,0
6961     // ADD R,I,C
6962     // ==> MADD R,A,B,C
6963     // --- Create(MADD);
6964     if (Pattern == MachineCombinerPattern::MULADDW_OP1) {
6965       Opc = AArch64::MADDWrrr;
6966       RC = &AArch64::GPR32RegClass;
6967     } else {
6968       Opc = AArch64::MADDXrrr;
6969       RC = &AArch64::GPR64RegClass;
6970     }
6971     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
6972     break;
6973   case MachineCombinerPattern::MULADDW_OP2:
6974   case MachineCombinerPattern::MULADDX_OP2:
6975     // MUL I=A,B,0
6976     // ADD R,C,I
6977     // ==> MADD R,A,B,C
6978     // --- Create(MADD);
6979     if (Pattern == MachineCombinerPattern::MULADDW_OP2) {
6980       Opc = AArch64::MADDWrrr;
6981       RC = &AArch64::GPR32RegClass;
6982     } else {
6983       Opc = AArch64::MADDXrrr;
6984       RC = &AArch64::GPR64RegClass;
6985     }
6986     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
6987     break;
6988   case MachineCombinerPattern::MULADDWI_OP1:
6989   case MachineCombinerPattern::MULADDXI_OP1: {
6990     // MUL I=A,B,0
6991     // ADD R,I,Imm
6992     // ==> MOV V, Imm
6993     // ==> MADD R,A,B,V
6994     // --- Create(MADD);
6995     const TargetRegisterClass *OrrRC;
6996     unsigned BitSize, OrrOpc, ZeroReg;
6997     if (Pattern == MachineCombinerPattern::MULADDWI_OP1) {
6998       OrrOpc = AArch64::ORRWri;
6999       OrrRC = &AArch64::GPR32spRegClass;
7000       BitSize = 32;
7001       ZeroReg = AArch64::WZR;
7002       Opc = AArch64::MADDWrrr;
7003       RC = &AArch64::GPR32RegClass;
7004     } else {
7005       OrrOpc = AArch64::ORRXri;
7006       OrrRC = &AArch64::GPR64spRegClass;
7007       BitSize = 64;
7008       ZeroReg = AArch64::XZR;
7009       Opc = AArch64::MADDXrrr;
7010       RC = &AArch64::GPR64RegClass;
7011     }
7012     Register NewVR = MRI.createVirtualRegister(OrrRC);
7013     uint64_t Imm = Root.getOperand(2).getImm();
7014 
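         // ADDWri/ADDXri may carry a left shift of the immediate in operand 3.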
7015     if (Root.getOperand(3).isImm()) {
7016       unsigned Val = Root.getOperand(3).getImm();
7017       Imm = Imm << Val;
7018     }
7019     uint64_t UImm = SignExtend64(Imm, BitSize);
7020     // Give up unless the immediate can be materialized with one instruction.
7021     SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
7022     AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
7023     if (Insn.size() != 1)
7024       return;
7025     auto MovI = Insn.begin();
7026     MachineInstrBuilder MIB1;
7027     // MOV is an alias for one of three instructions: movz, movn, and orr.
7028     if (MovI->Opcode == OrrOpc)
7029       MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR)
7030                  .addReg(ZeroReg)
7031                  .addImm(MovI->Op2);
7032     else {
7033       if (BitSize == 32)
7034         assert((MovI->Opcode == AArch64::MOVNWi ||
7035                 MovI->Opcode == AArch64::MOVZWi) &&
7036                "Expected opcode");
7037       else
7038         assert((MovI->Opcode == AArch64::MOVNXi ||
7039                 MovI->Opcode == AArch64::MOVZXi) &&
7040                "Expected opcode");
7041       MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR)
7042                  .addImm(MovI->Op1)
7043                  .addImm(MovI->Op2);
7044     }
7045     InsInstrs.push_back(MIB1);
7046     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7047     MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7048     break;
7049   }
7050   case MachineCombinerPattern::MULSUBW_OP1:
7051   case MachineCombinerPattern::MULSUBX_OP1: {
7052     // MUL I=A,B,0
7053     // SUB R,I, C
7054     // ==> SUB  V, 0, C
7055     // ==> MADD R,A,B,V // = -C + A*B
7056     // --- Create(MADD);
7057     const TargetRegisterClass *SubRC;
7058     unsigned SubOpc, ZeroReg;
7059     if (Pattern == MachineCombinerPattern::MULSUBW_OP1) {
7060       SubOpc = AArch64::SUBWrr;
7061       SubRC = &AArch64::GPR32spRegClass;
7062       ZeroReg = AArch64::WZR;
7063       Opc = AArch64::MADDWrrr;
7064       RC = &AArch64::GPR32RegClass;
7065     } else {
7066       SubOpc = AArch64::SUBXrr;
7067       SubRC = &AArch64::GPR64spRegClass;
7068       ZeroReg = AArch64::XZR;
7069       Opc = AArch64::MADDXrrr;
7070       RC = &AArch64::GPR64RegClass;
7071     }
7072     Register NewVR = MRI.createVirtualRegister(SubRC);
7073     // SUB NewVR, 0, C
7074     MachineInstrBuilder MIB1 =
7075         BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR)
7076             .addReg(ZeroReg)
7077             .add(Root.getOperand(2));
7078     InsInstrs.push_back(MIB1);
7079     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7080     MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7081     break;
7082   }
7083   case MachineCombinerPattern::MULSUBW_OP2:
7084   case MachineCombinerPattern::MULSUBX_OP2:
7085     // MUL I=A,B,0
7086     // SUB R,C,I
7087     // ==> MSUB R,A,B,C (computes C - A*B)
7088     // --- Create(MSUB);
7089     if (Pattern == MachineCombinerPattern::MULSUBW_OP2) {
7090       Opc = AArch64::MSUBWrrr;
7091       RC = &AArch64::GPR32RegClass;
7092     } else {
7093       Opc = AArch64::MSUBXrrr;
7094       RC = &AArch64::GPR64RegClass;
7095     }
7096     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7097     break;
7098   case MachineCombinerPattern::MULSUBWI_OP1:
7099   case MachineCombinerPattern::MULSUBXI_OP1: {
7100     // MUL I=A,B,0
7101     // SUB R,I, Imm
7102     // ==> MOV  V, -Imm
7103     // ==> MADD R,A,B,V // = -Imm + A*B
7104     // --- Create(MADD);
7105     const TargetRegisterClass *OrrRC;
7106     unsigned BitSize, OrrOpc, ZeroReg;
7107     if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) {
7108       OrrOpc = AArch64::ORRWri;
7109       OrrRC = &AArch64::GPR32spRegClass;
7110       BitSize = 32;
7111       ZeroReg = AArch64::WZR;
7112       Opc = AArch64::MADDWrrr;
7113       RC = &AArch64::GPR32RegClass;
7114     } else {
7115       OrrOpc = AArch64::ORRXri;
7116       OrrRC = &AArch64::GPR64spRegClass;
7117       BitSize = 64;
7118       ZeroReg = AArch64::XZR;
7119       Opc = AArch64::MADDXrrr;
7120       RC = &AArch64::GPR64RegClass;
7121     }
7122     Register NewVR = MRI.createVirtualRegister(OrrRC);
7123     uint64_t Imm = Root.getOperand(2).getImm();
7124     if (Root.getOperand(3).isImm()) {
7125       unsigned Val = Root.getOperand(3).getImm();
7126       Imm = Imm << Val;
7127     }
7128     uint64_t UImm = SignExtend64(-Imm, BitSize);
7129     // Give up unless the immediate can be materialized with one instruction.
7130     SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
7131     AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
7132     if (Insn.size() != 1)
7133       return;
7134     auto MovI = Insn.begin();
7135     MachineInstrBuilder MIB1;
7136     // MOV is an alias for one of three instructions: movz, movn, and orr.
7137     if (MovI->Opcode == OrrOpc)
7138       MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR)
7139                  .addReg(ZeroReg)
7140                  .addImm(MovI->Op2);
7141     else {
7142       if (BitSize == 32)
7143         assert((MovI->Opcode == AArch64::MOVNWi ||
7144                 MovI->Opcode == AArch64::MOVZWi) &&
7145                "Expected opcode");
7146       else
7147         assert((MovI->Opcode == AArch64::MOVNXi ||
7148                 MovI->Opcode == AArch64::MOVZXi) &&
7149                "Expected opcode");
7150       MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR)
7151                  .addImm(MovI->Op1)
7152                  .addImm(MovI->Op2);
7153     }
7154     InsInstrs.push_back(MIB1);
7155     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7156     MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7157     break;
7158   }
7159 
7160   case MachineCombinerPattern::MULADDv8i8_OP1:
7161     Opc = AArch64::MLAv8i8;
7162     RC = &AArch64::FPR64RegClass;
7163     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7164     break;
7165   case MachineCombinerPattern::MULADDv8i8_OP2:
7166     Opc = AArch64::MLAv8i8;
7167     RC = &AArch64::FPR64RegClass;
7168     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7169     break;
7170   case MachineCombinerPattern::MULADDv16i8_OP1:
7171     Opc = AArch64::MLAv16i8;
7172     RC = &AArch64::FPR128RegClass;
7173     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7174     break;
7175   case MachineCombinerPattern::MULADDv16i8_OP2:
7176     Opc = AArch64::MLAv16i8;
7177     RC = &AArch64::FPR128RegClass;
7178     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7179     break;
7180   case MachineCombinerPattern::MULADDv4i16_OP1:
7181     Opc = AArch64::MLAv4i16;
7182     RC = &AArch64::FPR64RegClass;
7183     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7184     break;
7185   case MachineCombinerPattern::MULADDv4i16_OP2:
7186     Opc = AArch64::MLAv4i16;
7187     RC = &AArch64::FPR64RegClass;
7188     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7189     break;
7190   case MachineCombinerPattern::MULADDv8i16_OP1:
7191     Opc = AArch64::MLAv8i16;
7192     RC = &AArch64::FPR128RegClass;
7193     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7194     break;
7195   case MachineCombinerPattern::MULADDv8i16_OP2:
7196     Opc = AArch64::MLAv8i16;
7197     RC = &AArch64::FPR128RegClass;
7198     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7199     break;
7200   case MachineCombinerPattern::MULADDv2i32_OP1:
7201     Opc = AArch64::MLAv2i32;
7202     RC = &AArch64::FPR64RegClass;
7203     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7204     break;
7205   case MachineCombinerPattern::MULADDv2i32_OP2:
7206     Opc = AArch64::MLAv2i32;
7207     RC = &AArch64::FPR64RegClass;
7208     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7209     break;
7210   case MachineCombinerPattern::MULADDv4i32_OP1:
7211     Opc = AArch64::MLAv4i32;
7212     RC = &AArch64::FPR128RegClass;
7213     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7214     break;
7215   case MachineCombinerPattern::MULADDv4i32_OP2:
7216     Opc = AArch64::MLAv4i32;
7217     RC = &AArch64::FPR128RegClass;
7218     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7219     break;
7220 
7221   case MachineCombinerPattern::MULSUBv8i8_OP1:
7222     Opc = AArch64::MLAv8i8;
7223     RC = &AArch64::FPR64RegClass;
7224     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7225                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
7226                                  RC);
7227     break;
7228   case MachineCombinerPattern::MULSUBv8i8_OP2:
7229     Opc = AArch64::MLSv8i8;
7230     RC = &AArch64::FPR64RegClass;
7231     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7232     break;
7233   case MachineCombinerPattern::MULSUBv16i8_OP1:
7234     Opc = AArch64::MLAv16i8;
7235     RC = &AArch64::FPR128RegClass;
7236     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7237                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
7238                                  RC);
7239     break;
7240   case MachineCombinerPattern::MULSUBv16i8_OP2:
7241     Opc = AArch64::MLSv16i8;
7242     RC = &AArch64::FPR128RegClass;
7243     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7244     break;
7245   case MachineCombinerPattern::MULSUBv4i16_OP1:
7246     Opc = AArch64::MLAv4i16;
7247     RC = &AArch64::FPR64RegClass;
7248     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7249                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
7250                                  RC);
7251     break;
7252   case MachineCombinerPattern::MULSUBv4i16_OP2:
7253     Opc = AArch64::MLSv4i16;
7254     RC = &AArch64::FPR64RegClass;
7255     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7256     break;
7257   case MachineCombinerPattern::MULSUBv8i16_OP1:
7258     Opc = AArch64::MLAv8i16;
7259     RC = &AArch64::FPR128RegClass;
7260     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7261                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
7262                                  RC);
7263     break;
7264   case MachineCombinerPattern::MULSUBv8i16_OP2:
7265     Opc = AArch64::MLSv8i16;
7266     RC = &AArch64::FPR128RegClass;
7267     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7268     break;
7269   case MachineCombinerPattern::MULSUBv2i32_OP1:
7270     Opc = AArch64::MLAv2i32;
7271     RC = &AArch64::FPR64RegClass;
7272     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7273                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
7274                                  RC);
7275     break;
7276   case MachineCombinerPattern::MULSUBv2i32_OP2:
7277     Opc = AArch64::MLSv2i32;
7278     RC = &AArch64::FPR64RegClass;
7279     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7280     break;
7281   case MachineCombinerPattern::MULSUBv4i32_OP1:
7282     Opc = AArch64::MLAv4i32;
7283     RC = &AArch64::FPR128RegClass;
7284     MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7285                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
7286                                  RC);
7287     break;
7288   case MachineCombinerPattern::MULSUBv4i32_OP2:
7289     Opc = AArch64::MLSv4i32;
7290     RC = &AArch64::FPR128RegClass;
7291     MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7292     break;
7293 
7294   case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
7295     Opc = AArch64::MLAv4i16_indexed;
7296     RC = &AArch64::FPR64RegClass;
7297     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7298     break;
7299   case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
7300     Opc = AArch64::MLAv4i16_indexed;
7301     RC = &AArch64::FPR64RegClass;
7302     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7303     break;
7304   case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
7305     Opc = AArch64::MLAv8i16_indexed;
7306     RC = &AArch64::FPR128RegClass;
7307     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7308     break;
7309   case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
7310     Opc = AArch64::MLAv8i16_indexed;
7311     RC = &AArch64::FPR128RegClass;
7312     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7313     break;
7314   case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
7315     Opc = AArch64::MLAv2i32_indexed;
7316     RC = &AArch64::FPR64RegClass;
7317     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7318     break;
7319   case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
7320     Opc = AArch64::MLAv2i32_indexed;
7321     RC = &AArch64::FPR64RegClass;
7322     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7323     break;
7324   case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
7325     Opc = AArch64::MLAv4i32_indexed;
7326     RC = &AArch64::FPR128RegClass;
7327     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7328     break;
7329   case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
7330     Opc = AArch64::MLAv4i32_indexed;
7331     RC = &AArch64::FPR128RegClass;
7332     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7333     break;
7334 
7335   case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
7336     Opc = AArch64::MLAv4i16_indexed;
7337     RC = &AArch64::FPR64RegClass;
7338     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7339                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
7340                                  RC);
7341     break;
7342   case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
7343     Opc = AArch64::MLSv4i16_indexed;
7344     RC = &AArch64::FPR64RegClass;
7345     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7346     break;
7347   case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
7348     Opc = AArch64::MLAv8i16_indexed;
7349     RC = &AArch64::FPR128RegClass;
7350     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7351                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
7352                                  RC);
7353     break;
7354   case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
7355     Opc = AArch64::MLSv8i16_indexed;
7356     RC = &AArch64::FPR128RegClass;
7357     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7358     break;
7359   case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
7360     Opc = AArch64::MLAv2i32_indexed;
7361     RC = &AArch64::FPR64RegClass;
7362     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7363                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
7364                                  RC);
7365     break;
7366   case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
7367     Opc = AArch64::MLSv2i32_indexed;
7368     RC = &AArch64::FPR64RegClass;
7369     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7370     break;
7371   case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
7372     Opc = AArch64::MLAv4i32_indexed;
7373     RC = &AArch64::FPR128RegClass;
7374     MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7375                                  InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
7376                                  RC);
7377     break;
7378   case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
7379     Opc = AArch64::MLSv4i32_indexed;
7380     RC = &AArch64::FPR128RegClass;
7381     MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7382     break;
7383 
7384   // Floating Point Support
7385   case MachineCombinerPattern::FMULADDH_OP1:
7386     Opc = AArch64::FMADDHrrr;
7387     RC = &AArch64::FPR16RegClass;
7388     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7389     break;
7390   case MachineCombinerPattern::FMULADDS_OP1:
7391     Opc = AArch64::FMADDSrrr;
7392     RC = &AArch64::FPR32RegClass;
7393     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7394     break;
7395   case MachineCombinerPattern::FMULADDD_OP1:
7396     Opc = AArch64::FMADDDrrr;
7397     RC = &AArch64::FPR64RegClass;
7398     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7399     break;
7400 
7401   case MachineCombinerPattern::FMULADDH_OP2:
7402     Opc = AArch64::FMADDHrrr;
7403     RC = &AArch64::FPR16RegClass;
7404     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7405     break;
7406   case MachineCombinerPattern::FMULADDS_OP2:
7407     Opc = AArch64::FMADDSrrr;
7408     RC = &AArch64::FPR32RegClass;
7409     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7410     break;
7411   case MachineCombinerPattern::FMULADDD_OP2:
7412     Opc = AArch64::FMADDDrrr;
7413     RC = &AArch64::FPR64RegClass;
7414     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7415     break;
7416 
7417   case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
7418     Opc = AArch64::FMLAv1i32_indexed;
7419     RC = &AArch64::FPR32RegClass;
7420     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7421                            FMAInstKind::Indexed);
7422     break;
7423   case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
7424     Opc = AArch64::FMLAv1i32_indexed;
7425     RC = &AArch64::FPR32RegClass;
7426     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7427                            FMAInstKind::Indexed);
7428     break;
7429 
7430   case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
7431     Opc = AArch64::FMLAv1i64_indexed;
7432     RC = &AArch64::FPR64RegClass;
7433     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7434                            FMAInstKind::Indexed);
7435     break;
7436   case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
7437     Opc = AArch64::FMLAv1i64_indexed;
7438     RC = &AArch64::FPR64RegClass;
7439     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7440                            FMAInstKind::Indexed);
7441     break;
7442 
7443   case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
7444     RC = &AArch64::FPR64RegClass;
7445     Opc = AArch64::FMLAv4i16_indexed;
7446     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7447                            FMAInstKind::Indexed);
7448     break;
7449   case MachineCombinerPattern::FMLAv4f16_OP1:
7450     RC = &AArch64::FPR64RegClass;
7451     Opc = AArch64::FMLAv4f16;
7452     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7453                            FMAInstKind::Accumulator);
7454     break;
7455   case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
7456     RC = &AArch64::FPR64RegClass;
7457     Opc = AArch64::FMLAv4i16_indexed;
7458     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7459                            FMAInstKind::Indexed);
7460     break;
7461   case MachineCombinerPattern::FMLAv4f16_OP2:
7462     RC = &AArch64::FPR64RegClass;
7463     Opc = AArch64::FMLAv4f16;
7464     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7465                            FMAInstKind::Accumulator);
7466     break;
7467 
7468   case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
7469   case MachineCombinerPattern::FMLAv2f32_OP1:
7470     RC = &AArch64::FPR64RegClass;
7471     if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
7472       Opc = AArch64::FMLAv2i32_indexed;
7473       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7474                              FMAInstKind::Indexed);
7475     } else {
7476       Opc = AArch64::FMLAv2f32;
7477       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7478                              FMAInstKind::Accumulator);
7479     }
7480     break;
7481   case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
7482   case MachineCombinerPattern::FMLAv2f32_OP2:
7483     RC = &AArch64::FPR64RegClass;
7484     if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
7485       Opc = AArch64::FMLAv2i32_indexed;
7486       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7487                              FMAInstKind::Indexed);
7488     } else {
7489       Opc = AArch64::FMLAv2f32;
7490       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7491                              FMAInstKind::Accumulator);
7492     }
7493     break;
7494 
7495   case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
7496     RC = &AArch64::FPR128RegClass;
7497     Opc = AArch64::FMLAv8i16_indexed;
7498     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7499                            FMAInstKind::Indexed);
7500     break;
7501   case MachineCombinerPattern::FMLAv8f16_OP1:
7502     RC = &AArch64::FPR128RegClass;
7503     Opc = AArch64::FMLAv8f16;
7504     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7505                            FMAInstKind::Accumulator);
7506     break;
7507   case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
7508     RC = &AArch64::FPR128RegClass;
7509     Opc = AArch64::FMLAv8i16_indexed;
7510     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7511                            FMAInstKind::Indexed);
7512     break;
7513   case MachineCombinerPattern::FMLAv8f16_OP2:
7514     RC = &AArch64::FPR128RegClass;
7515     Opc = AArch64::FMLAv8f16;
7516     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7517                            FMAInstKind::Accumulator);
7518     break;
7519 
7520   case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
7521   case MachineCombinerPattern::FMLAv2f64_OP1:
7522     RC = &AArch64::FPR128RegClass;
7523     if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
7524       Opc = AArch64::FMLAv2i64_indexed;
7525       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7526                              FMAInstKind::Indexed);
7527     } else {
7528       Opc = AArch64::FMLAv2f64;
7529       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7530                              FMAInstKind::Accumulator);
7531     }
7532     break;
7533   case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
7534   case MachineCombinerPattern::FMLAv2f64_OP2:
7535     RC = &AArch64::FPR128RegClass;
7536     if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
7537       Opc = AArch64::FMLAv2i64_indexed;
7538       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7539                              FMAInstKind::Indexed);
7540     } else {
7541       Opc = AArch64::FMLAv2f64;
7542       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7543                              FMAInstKind::Accumulator);
7544     }
7545     break;
7546 
7547   case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
7548   case MachineCombinerPattern::FMLAv4f32_OP1:
7549     RC = &AArch64::FPR128RegClass;
7550     if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
7551       Opc = AArch64::FMLAv4i32_indexed;
7552       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7553                              FMAInstKind::Indexed);
7554     } else {
7555       Opc = AArch64::FMLAv4f32;
7556       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7557                              FMAInstKind::Accumulator);
7558     }
7559     break;
7560 
7561   case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
7562   case MachineCombinerPattern::FMLAv4f32_OP2:
7563     RC = &AArch64::FPR128RegClass;
7564     if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
7565       Opc = AArch64::FMLAv4i32_indexed;
7566       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7567                              FMAInstKind::Indexed);
7568     } else {
7569       Opc = AArch64::FMLAv4f32;
7570       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7571                              FMAInstKind::Accumulator);
7572     }
7573     break;
7574 
7575   case MachineCombinerPattern::FMULSUBH_OP1:
7576     Opc = AArch64::FNMSUBHrrr;
7577     RC = &AArch64::FPR16RegClass;
7578     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7579     break;
7580   case MachineCombinerPattern::FMULSUBS_OP1:
7581     Opc = AArch64::FNMSUBSrrr;
7582     RC = &AArch64::FPR32RegClass;
7583     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7584     break;
7585   case MachineCombinerPattern::FMULSUBD_OP1:
7586     Opc = AArch64::FNMSUBDrrr;
7587     RC = &AArch64::FPR64RegClass;
7588     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7589     break;
7590 
7591   case MachineCombinerPattern::FNMULSUBH_OP1:
7592     Opc = AArch64::FNMADDHrrr;
7593     RC = &AArch64::FPR16RegClass;
7594     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7595     break;
7596   case MachineCombinerPattern::FNMULSUBS_OP1:
7597     Opc = AArch64::FNMADDSrrr;
7598     RC = &AArch64::FPR32RegClass;
7599     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7600     break;
7601   case MachineCombinerPattern::FNMULSUBD_OP1:
7602     Opc = AArch64::FNMADDDrrr;
7603     RC = &AArch64::FPR64RegClass;
7604     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7605     break;
7606 
7607   case MachineCombinerPattern::FMULSUBH_OP2:
7608     Opc = AArch64::FMSUBHrrr;
7609     RC = &AArch64::FPR16RegClass;
7610     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7611     break;
7612   case MachineCombinerPattern::FMULSUBS_OP2:
7613     Opc = AArch64::FMSUBSrrr;
7614     RC = &AArch64::FPR32RegClass;
7615     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7616     break;
7617   case MachineCombinerPattern::FMULSUBD_OP2:
7618     Opc = AArch64::FMSUBDrrr;
7619     RC = &AArch64::FPR64RegClass;
7620     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7621     break;
7622 
7623   case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
7624     Opc = AArch64::FMLSv1i32_indexed;
7625     RC = &AArch64::FPR32RegClass;
7626     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7627                            FMAInstKind::Indexed);
7628     break;
7629 
7630   case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
7631     Opc = AArch64::FMLSv1i64_indexed;
7632     RC = &AArch64::FPR64RegClass;
7633     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7634                            FMAInstKind::Indexed);
7635     break;
7636 
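       // For the FMLS*_OP1 patterns the multiply is the minuend, so FMLS
       // (acc - mul) does not apply directly; we negate the subtracted operand
       // and accumulate into it with FMLA:
       //   fsub(fmul(a, b), c) ==> fmla(fneg(c), a, b)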
7637   case MachineCombinerPattern::FMLSv4f16_OP1:
7638   case MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
7639     RC = &AArch64::FPR64RegClass;
7640     Register NewVR = MRI.createVirtualRegister(RC);
7641     MachineInstrBuilder MIB1 =
7642         BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR)
7643             .add(Root.getOperand(2));
7644     InsInstrs.push_back(MIB1);
7645     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7646     if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) {
7647       Opc = AArch64::FMLAv4f16;
7648       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7649                              FMAInstKind::Accumulator, &NewVR);
7650     } else {
7651       Opc = AArch64::FMLAv4i16_indexed;
7652       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7653                              FMAInstKind::Indexed, &NewVR);
7654     }
7655     break;
7656   }
7657   case MachineCombinerPattern::FMLSv4f16_OP2:
7658     RC = &AArch64::FPR64RegClass;
7659     Opc = AArch64::FMLSv4f16;
7660     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7661                            FMAInstKind::Accumulator);
7662     break;
7663   case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
7664     RC = &AArch64::FPR64RegClass;
7665     Opc = AArch64::FMLSv4i16_indexed;
7666     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7667                            FMAInstKind::Indexed);
7668     break;
7669 
7670   case MachineCombinerPattern::FMLSv2f32_OP2:
7671   case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
7672     RC = &AArch64::FPR64RegClass;
7673     if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
7674       Opc = AArch64::FMLSv2i32_indexed;
7675       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7676                              FMAInstKind::Indexed);
7677     } else {
7678       Opc = AArch64::FMLSv2f32;
7679       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7680                              FMAInstKind::Accumulator);
7681     }
7682     break;
7683 
7684   case MachineCombinerPattern::FMLSv8f16_OP1:
7685   case MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
7686     RC = &AArch64::FPR128RegClass;
7687     Register NewVR = MRI.createVirtualRegister(RC);
7688     MachineInstrBuilder MIB1 =
7689         BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR)
7690             .add(Root.getOperand(2));
7691     InsInstrs.push_back(MIB1);
7692     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7693     if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) {
7694       Opc = AArch64::FMLAv8f16;
7695       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7696                              FMAInstKind::Accumulator, &NewVR);
7697     } else {
7698       Opc = AArch64::FMLAv8i16_indexed;
7699       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7700                              FMAInstKind::Indexed, &NewVR);
7701     }
7702     break;
7703   }
7704   case MachineCombinerPattern::FMLSv8f16_OP2:
7705     RC = &AArch64::FPR128RegClass;
7706     Opc = AArch64::FMLSv8f16;
7707     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7708                            FMAInstKind::Accumulator);
7709     break;
7710   case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
7711     RC = &AArch64::FPR128RegClass;
7712     Opc = AArch64::FMLSv8i16_indexed;
7713     MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7714                            FMAInstKind::Indexed);
7715     break;
7716 
7717   case MachineCombinerPattern::FMLSv2f64_OP2:
7718   case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
7719     RC = &AArch64::FPR128RegClass;
7720     if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
7721       Opc = AArch64::FMLSv2i64_indexed;
7722       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7723                              FMAInstKind::Indexed);
7724     } else {
7725       Opc = AArch64::FMLSv2f64;
7726       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7727                              FMAInstKind::Accumulator);
7728     }
7729     break;
7730 
7731   case MachineCombinerPattern::FMLSv4f32_OP2:
7732   case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
7733     RC = &AArch64::FPR128RegClass;
7734     if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
7735       Opc = AArch64::FMLSv4i32_indexed;
7736       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7737                              FMAInstKind::Indexed);
7738     } else {
7739       Opc = AArch64::FMLSv4f32;
7740       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7741                              FMAInstKind::Accumulator);
7742     }
7743     break;
7744   case MachineCombinerPattern::FMLSv2f32_OP1:
7745   case MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
7746     RC = &AArch64::FPR64RegClass;
7747     Register NewVR = MRI.createVirtualRegister(RC);
7748     MachineInstrBuilder MIB1 =
7749         BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR)
7750             .add(Root.getOperand(2));
7751     InsInstrs.push_back(MIB1);
7752     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7753     if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
7754       Opc = AArch64::FMLAv2i32_indexed;
7755       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7756                              FMAInstKind::Indexed, &NewVR);
7757     } else {
7758       Opc = AArch64::FMLAv2f32;
7759       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7760                              FMAInstKind::Accumulator, &NewVR);
7761     }
7762     break;
7763   }
7764   case MachineCombinerPattern::FMLSv4f32_OP1:
7765   case MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
7766     RC = &AArch64::FPR128RegClass;
7767     Register NewVR = MRI.createVirtualRegister(RC);
7768     MachineInstrBuilder MIB1 =
7769         BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR)
7770             .add(Root.getOperand(2));
7771     InsInstrs.push_back(MIB1);
7772     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7773     if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
7774       Opc = AArch64::FMLAv4i32_indexed;
7775       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7776                              FMAInstKind::Indexed, &NewVR);
7777     } else {
7778       Opc = AArch64::FMLAv4f32;
7779       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7780                              FMAInstKind::Accumulator, &NewVR);
7781     }
7782     break;
7783   }
7784   case MachineCombinerPattern::FMLSv2f64_OP1:
7785   case MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
7786     RC = &AArch64::FPR128RegClass;
7787     Register NewVR = MRI.createVirtualRegister(RC);
7788     MachineInstrBuilder MIB1 =
7789         BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR)
7790             .add(Root.getOperand(2));
7791     InsInstrs.push_back(MIB1);
7792     InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7793     if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
7794       Opc = AArch64::FMLAv2i64_indexed;
7795       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7796                              FMAInstKind::Indexed, &NewVR);
7797     } else {
7798       Opc = AArch64::FMLAv2f64;
7799       MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7800                              FMAInstKind::Accumulator, &NewVR);
7801     }
7802     break;
7803   }
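       // The FMUL*_indexed patterns match an FMUL whose IdxDupOp operand is a
       // DUP of a vector lane; genIndexedMultiply folds the DUP away by
       // emitting the by-element (lane-indexed) form of FMUL instead.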
7804   case MachineCombinerPattern::FMULv2i32_indexed_OP1:
7805   case MachineCombinerPattern::FMULv2i32_indexed_OP2: {
7806     unsigned IdxDupOp =
7807         (Pattern == MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1 : 2;
7808     genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
7809                        &AArch64::FPR128RegClass, MRI);
7810     break;
7811   }
7812   case MachineCombinerPattern::FMULv2i64_indexed_OP1:
7813   case MachineCombinerPattern::FMULv2i64_indexed_OP2: {
7814     unsigned IdxDupOp =
7815         (Pattern == MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1 : 2;
7816     genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
7817                        &AArch64::FPR128RegClass, MRI);
7818     break;
7819   }
7820   case MachineCombinerPattern::FMULv4i16_indexed_OP1:
7821   case MachineCombinerPattern::FMULv4i16_indexed_OP2: {
7822     unsigned IdxDupOp =
7823         (Pattern == MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1 : 2;
7824     genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
7825                        &AArch64::FPR128_loRegClass, MRI);
7826     break;
7827   }
7828   case MachineCombinerPattern::FMULv4i32_indexed_OP1:
7829   case MachineCombinerPattern::FMULv4i32_indexed_OP2: {
7830     unsigned IdxDupOp =
7831         (Pattern == MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1 : 2;
7832     genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
7833                        &AArch64::FPR128RegClass, MRI);
7834     break;
7835   }
7836   case MachineCombinerPattern::FMULv8i16_indexed_OP1:
7837   case MachineCombinerPattern::FMULv8i16_indexed_OP2: {
7838     unsigned IdxDupOp =
7839         (Pattern == MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1 : 2;
7840     genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
7841                        &AArch64::FPR128_loRegClass, MRI);
7842     break;
7843   }
7844   case MachineCombinerPattern::FNMADD: {
7845     MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
7846     break;
7847   }
7848 
7849   } // end switch (Pattern)
7850   // Record MUL and ADD/SUB for deletion
7851   if (MUL)
7852     DelInstrs.push_back(MUL);
7853   DelInstrs.push_back(&Root);
7854 
7855   // Set the flags on the inserted instructions to be the merged flags of the
7856   // instructions that we have combined.
7857   uint32_t Flags = Root.getFlags();
7858   if (MUL)
7859     Flags = Root.mergeFlagsWith(*MUL);
7860   for (auto *MI : InsInstrs)
7861     MI->setFlags(Flags);
7862 }
7863 
7864 /// Replace csincr-branch sequence by simple conditional branch
7865 ///
7866 /// Examples:
7867 /// 1. \code
7868 ///   csinc  w9, wzr, wzr, <condition code>
7869 ///   tbnz   w9, #0, 0x44
7870 ///    \endcode
7871 /// to
7872 ///    \code
7873 ///   b.<inverted condition code>
7874 ///    \endcode
7875 ///
7876 /// 2. \code
7877 ///   csinc w9, wzr, wzr, <condition code>
7878 ///   tbz   w9, #0, 0x44
7879 ///    \endcode
7880 /// to
7881 ///    \code
7882 ///   b.<condition code>
7883 ///    \endcode
7884 ///
7885 /// Replace compare and branch sequence by TBZ/TBNZ instruction when the
7886 /// compare's constant operand is power of 2.
7887 ///
7888 /// Examples:
7889 ///    \code
7890 ///   and  w8, w8, #0x400
7891 ///   cbnz w8, L1
7892 ///    \endcode
7893 /// to
7894 ///    \code
7895 ///   tbnz w8, #10, L1
7896 ///    \endcode
7897 ///
7898 /// \param  MI Conditional Branch
7899 /// \return True when the simple conditional branch is generated
7900 ///
7901 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
7902   bool IsNegativeBranch = false;
7903   bool IsTestAndBranch = false;
7904   unsigned TargetBBInMI = 0;
7905   switch (MI.getOpcode()) {
7906   default:
7907     llvm_unreachable("Unknown branch instruction?");
7908   case AArch64::Bcc:
7909     return false;
7910   case AArch64::CBZW:
7911   case AArch64::CBZX:
7912     TargetBBInMI = 1;
7913     break;
7914   case AArch64::CBNZW:
7915   case AArch64::CBNZX:
7916     TargetBBInMI = 1;
7917     IsNegativeBranch = true;
7918     break;
7919   case AArch64::TBZW:
7920   case AArch64::TBZX:
7921     TargetBBInMI = 2;
7922     IsTestAndBranch = true;
7923     break;
7924   case AArch64::TBNZW:
7925   case AArch64::TBNZX:
7926     TargetBBInMI = 2;
7927     IsNegativeBranch = true;
7928     IsTestAndBranch = true;
7929     break;
7930   }
7931   // So we increment a zero register and test for bits other
7932   // than bit 0? Conservatively bail out in case the verifier
7933   // missed this case.
7934   if (IsTestAndBranch && MI.getOperand(1).getImm())
7935     return false;
7936 
7937   // Find Definition.
7938   assert(MI.getParent() && "Incomplete machine instruction\n");
7939   MachineBasicBlock *MBB = MI.getParent();
7940   MachineFunction *MF = MBB->getParent();
7941   MachineRegisterInfo *MRI = &MF->getRegInfo();
7942   Register VReg = MI.getOperand(0).getReg();
7943   if (!VReg.isVirtual())
7944     return false;
7945 
7946   MachineInstr *DefMI = MRI->getVRegDef(VReg);
7947 
7948   // Look through COPY instructions to find definition.
7949   while (DefMI->isCopy()) {
7950     Register CopyVReg = DefMI->getOperand(1).getReg();
7951     if (!MRI->hasOneNonDBGUse(CopyVReg))
7952       return false;
7953     if (!MRI->hasOneDef(CopyVReg))
7954       return false;
7955     DefMI = MRI->getVRegDef(CopyVReg);
7956   }
7957 
7958   switch (DefMI->getOpcode()) {
7959   default:
7960     return false;
7961   // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
7962   case AArch64::ANDWri:
7963   case AArch64::ANDXri: {
7964     if (IsTestAndBranch)
7965       return false;
7966     if (DefMI->getParent() != MBB)
7967       return false;
7968     if (!MRI->hasOneNonDBGUse(VReg))
7969       return false;
7970 
7971     bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
7972     uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
7973         DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
7974     if (!isPowerOf2_64(Mask))
7975       return false;
7976 
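         // For example (mirroring the doc comment above), 'and w8, w8, #0x400'
         // decodes to Mask == 0x400, giving Imm == 10, so the following
         // cbnz/cbz becomes 'tbnz/tbz w8, #10, ...' on the AND's input.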
7977     MachineOperand &MO = DefMI->getOperand(1);
7978     Register NewReg = MO.getReg();
7979     if (!NewReg.isVirtual())
7980       return false;
7981 
7982     assert(!MRI->def_empty(NewReg) && "Register must be defined.");
7983 
7984     MachineBasicBlock &RefToMBB = *MBB;
7985     MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
7986     DebugLoc DL = MI.getDebugLoc();
7987     unsigned Imm = Log2_64(Mask);
7988     unsigned Opc = (Imm < 32)
7989                        ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
7990                        : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
7991     MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
7992                               .addReg(NewReg)
7993                               .addImm(Imm)
7994                               .addMBB(TBB);
7995     // Register lives on to the CBZ now.
7996     MO.setIsKill(false);
7997 
7998     // For immediates smaller than 32, we need to use the 32-bit
7999     // variant (W) in all cases, because the 64-bit variant cannot
8000     // encode them.
8001     // Therefore, if the input register is 64-bit, we need to take its
8002     // 32-bit sub-register.
8003     if (!Is32Bit && Imm < 32)
8004       NewMI->getOperand(0).setSubReg(AArch64::sub_32);
8005     MI.eraseFromParent();
8006     return true;
8007   }
8008   // Look for CSINC
8009   case AArch64::CSINCWr:
8010   case AArch64::CSINCXr: {
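         // A CSINC whose sources are both the zero register materializes a
         // condition as 0 or 1 (the CSET idiom), so a CB(N)Z/TB(N)Z on bit 0 of
         // its result is really just a conditional branch on that condition
         // (possibly inverted).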
8011     if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
8012           DefMI->getOperand(2).getReg() == AArch64::WZR) &&
8013         !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
8014           DefMI->getOperand(2).getReg() == AArch64::XZR))
8015       return false;
8016 
8017     if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
8018       return false;
8019 
8020     AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
8021     // Convert only when the condition code is not modified between
8022     // the CSINC and the branch. The CC may be used by other
8023     // instructions in between.
8024     if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
8025       return false;
8026     MachineBasicBlock &RefToMBB = *MBB;
8027     MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
8028     DebugLoc DL = MI.getDebugLoc();
8029     if (IsNegativeBranch)
8030       CC = AArch64CC::getInvertedCondCode(CC);
8031     BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
8032     MI.eraseFromParent();
8033     return true;
8034   }
8035   }
8036 }
8037 
8038 std::pair<unsigned, unsigned>
8039 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
8040   const unsigned Mask = AArch64II::MO_FRAGMENT;
8041   return std::make_pair(TF & Mask, TF & ~Mask);
8042 }
8043 
8044 ArrayRef<std::pair<unsigned, const char *>>
8045 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
8046   using namespace AArch64II;
8047 
8048   static const std::pair<unsigned, const char *> TargetFlags[] = {
8049       {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
8050       {MO_G3, "aarch64-g3"},     {MO_G2, "aarch64-g2"},
8051       {MO_G1, "aarch64-g1"},     {MO_G0, "aarch64-g0"},
8052       {MO_HI12, "aarch64-hi12"}};
8053   return ArrayRef(TargetFlags);
8054 }
8055 
8056 ArrayRef<std::pair<unsigned, const char *>>
8057 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
8058   using namespace AArch64II;
8059 
8060   static const std::pair<unsigned, const char *> TargetFlags[] = {
8061       {MO_COFFSTUB, "aarch64-coffstub"},
8062       {MO_GOT, "aarch64-got"},
8063       {MO_NC, "aarch64-nc"},
8064       {MO_S, "aarch64-s"},
8065       {MO_TLS, "aarch64-tls"},
8066       {MO_DLLIMPORT, "aarch64-dllimport"},
8067       {MO_DLLIMPORTAUX, "aarch64-dllimportaux"},
8068       {MO_PREL, "aarch64-prel"},
8069       {MO_TAGGED, "aarch64-tagged"}};
8070   return ArrayRef(TargetFlags);
8071 }
8072 
8073 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
8074 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
8075   static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8076       {{MOSuppressPair, "aarch64-suppress-pair"},
8077        {MOStridedAccess, "aarch64-strided-access"}};
8078   return ArrayRef(TargetFlags);
8079 }
8080 
8081 /// Constants defining how certain sequences should be outlined.
8082 /// This encompasses how an outlined function should be called, and what kind of
8083 /// frame should be emitted for that outlined function.
8084 ///
8085 /// \p MachineOutlinerDefault implies that the function should be called with
8086 /// a save and restore of LR to the stack.
8087 ///
8088 /// That is,
8089 ///
8090 /// I1     Save LR                    OUTLINED_FUNCTION:
8091 /// I2 --> BL OUTLINED_FUNCTION       I1
8092 /// I3     Restore LR                 I2
8093 ///                                   I3
8094 ///                                   RET
8095 ///
8096 /// * Call construction overhead: 3 (save + BL + restore)
8097 /// * Frame construction overhead: 1 (ret)
8098 /// * Requires stack fixups? Yes
8099 ///
8100 /// \p MachineOutlinerTailCall implies that the function is being created from
8101 /// a sequence of instructions ending in a return.
8102 ///
8103 /// That is,
8104 ///
8105 /// I1                             OUTLINED_FUNCTION:
8106 /// I2 --> B OUTLINED_FUNCTION     I1
8107 /// RET                            I2
8108 ///                                RET
8109 ///
8110 /// * Call construction overhead: 1 (B)
8111 /// * Frame construction overhead: 0 (Return included in sequence)
8112 /// * Requires stack fixups? No
8113 ///
8114 /// \p MachineOutlinerNoLRSave implies that the function should be called using
8115 /// a BL instruction, but doesn't require LR to be saved and restored. This
8116 /// happens when LR is known to be dead.
8117 ///
8118 /// That is,
8119 ///
8120 /// I1                                OUTLINED_FUNCTION:
8121 /// I2 --> BL OUTLINED_FUNCTION       I1
8122 /// I3                                I2
8123 ///                                   I3
8124 ///                                   RET
8125 ///
8126 /// * Call construction overhead: 1 (BL)
8127 /// * Frame construction overhead: 1 (RET)
8128 /// * Requires stack fixups? No
8129 ///
8130 /// \p MachineOutlinerThunk implies that the function is being created from
8131 /// a sequence of instructions ending in a call. The outlined function is
8132 /// called with a BL instruction, and the outlined function tail-calls the
8133 /// original call destination.
8134 ///
8135 /// That is,
8136 ///
8137 /// I1                                OUTLINED_FUNCTION:
8138 /// I2 --> BL OUTLINED_FUNCTION       I1
8139 /// BL f                              I2
8140 ///                                   B f
8141 /// * Call construction overhead: 1 (BL)
8142 /// * Frame construction overhead: 0
8143 /// * Requires stack fixups? No
8144 ///
8145 /// \p MachineOutlinerRegSave implies that the function should be called with a
8146 /// save and restore of LR to an available register. This allows us to avoid
8147 /// stack fixups. Note that this outlining variant is compatible with the
8148 /// NoLRSave case.
8149 ///
8150 /// That is,
8151 ///
8152 /// I1     Save LR                    OUTLINED_FUNCTION:
8153 /// I2 --> BL OUTLINED_FUNCTION       I1
8154 /// I3     Restore LR                 I2
8155 ///                                   I3
8156 ///                                   RET
8157 ///
8158 /// * Call construction overhead: 3 (save + BL + restore)
8159 /// * Frame construction overhead: 1 (ret)
8160 /// * Requires stack fixups? No
8161 enum MachineOutlinerClass {
8162   MachineOutlinerDefault,  /// Emit a save, restore, call, and return.
8163   MachineOutlinerTailCall, /// Only emit a branch.
8164   MachineOutlinerNoLRSave, /// Emit a call and return.
8165   MachineOutlinerThunk,    /// Emit a call and tail-call.
8166   MachineOutlinerRegSave   /// Same as default, but save to a register.
8167 };
8168 
8169 enum MachineOutlinerMBBFlags {
8170   LRUnavailableSomewhere = 0x2,
8171   HasCalls = 0x4,
8172   UnsafeRegsDead = 0x8
8173 };
8174 
8175 Register
8176 AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
8177   MachineFunction *MF = C.getMF();
8178   const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
8179   const AArch64RegisterInfo *ARI =
8180       static_cast<const AArch64RegisterInfo *>(&TRI);
8181   // Check if there is an available register across the sequence that we can
8182   // use.
8183   for (unsigned Reg : AArch64::GPR64RegClass) {
8184     if (!ARI->isReservedReg(*MF, Reg) &&
8185         Reg != AArch64::LR &&  // LR is not reserved, but don't use it.
8186         Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
8187         Reg != AArch64::X17 && // Ditto for X17.
8188         C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
8189         C.isAvailableInsideSeq(Reg, TRI))
8190       return Reg;
8191   }
8192   return Register();
8193 }
8194 
8195 static bool
8196 outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
8197                                          const outliner::Candidate &b) {
8198   const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
8199   const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
8200 
8201   return MFIa->shouldSignReturnAddress(false) == MFIb->shouldSignReturnAddress(false) &&
8202          MFIa->shouldSignReturnAddress(true) == MFIb->shouldSignReturnAddress(true);
8203 }
8204 
8205 static bool
8206 outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
8207                                        const outliner::Candidate &b) {
8208   const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
8209   const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
8210 
8211   return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
8212 }
8213 
8214 static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
8215                                                 const outliner::Candidate &b) {
8216   const AArch64Subtarget &SubtargetA =
8217       a.getMF()->getSubtarget<AArch64Subtarget>();
8218   const AArch64Subtarget &SubtargetB =
8219       b.getMF()->getSubtarget<AArch64Subtarget>();
8220   return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
8221 }
8222 
8223 std::optional<outliner::OutlinedFunction>
8224 AArch64InstrInfo::getOutliningCandidateInfo(
8225     std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
8226   outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
8227   unsigned SequenceSize =
8228       std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0,
8229                       [this](unsigned Sum, const MachineInstr &MI) {
8230                         return Sum + getInstSizeInBytes(MI);
8231                       });
8232   unsigned NumBytesToCreateFrame = 0;
8233 
8234   // We only allow outlining for functions having exactly matching return
8235   // address signing attributes, i.e., all share the same value for the
8236   // attribute "sign-return-address" and all share the same type of key they
8237   // are signed with.
8238   // Additionally we require all functions to simultaneously either support
8239   // v8.3a features or not. Otherwise an outlined function could get signed
8240   // using dedicated v8.3 instructions and a call from a function that doesn't
8241   // support v8.3 instructions would therefore be invalid.
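       // std::adjacent_find returns the first pair of neighbouring candidates
       // for which the predicate holds; finding any such disagreeing pair means
       // the candidates are not mutually compatible, so we give up.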
8242   if (std::adjacent_find(
8243           RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
8244           [](const outliner::Candidate &a, const outliner::Candidate &b) {
8245             // Return true if a and b are non-equal w.r.t. return address
8246             // signing or support of v8.3a features
8247             if (outliningCandidatesSigningScopeConsensus(a, b) &&
8248                 outliningCandidatesSigningKeyConsensus(a, b) &&
8249                 outliningCandidatesV8_3OpsConsensus(a, b)) {
8250               return false;
8251             }
8252             return true;
8253           }) != RepeatedSequenceLocs.end()) {
8254     return std::nullopt;
8255   }
8256 
8257   // Since at this point all candidates agree on their return address signing
8258   // picking just one is fine. If the candidate functions potentially sign their
8259   // return addresses, the outlined function should do the same. Note that in
8260   // the case of "sign-return-address"="non-leaf" this is an assumption: It is
8261   // not certainly true that the outlined function will have to sign its return
8262   // address but this decision is made later, when the decision to outline
8263   // has already been made.
8264   // The same holds for the number of additional instructions we need: On
8265   // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
8266   // necessary. However, at this point we don't know if the outlined function
8267   // will have a RET instruction so we assume the worst.
8268   const TargetRegisterInfo &TRI = getRegisterInfo();
8269   // Performing a tail call may require extra checks when PAuth is enabled.
8270   // If PAuth is disabled, set it to zero for uniformity.
8271   unsigned NumBytesToCheckLRInTCEpilogue = 0;
8272   if (FirstCand.getMF()
8273           ->getInfo<AArch64FunctionInfo>()
8274           ->shouldSignReturnAddress(true)) {
8275     // One PAC and one AUT instructions
8276     NumBytesToCreateFrame += 8;
8277 
8278     // PAuth is enabled - set extra tail call cost, if any.
8279     auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod();
8280     NumBytesToCheckLRInTCEpilogue =
8281         AArch64PAuth::getCheckerSizeInBytes(LRCheckMethod);
8282     // Checking the authenticated LR value may significantly impact
8283     // SequenceSize, so account for it for more precise results.
8284     if (isTailCallReturnInst(*RepeatedSequenceLocs[0].back()))
8285       SequenceSize += NumBytesToCheckLRInTCEpilogue;
8286 
8287     // We have to check if SP-modifying instructions would get outlined.
8288     // If so, we only allow outlining if SP is unchanged overall, so matching
8289     // sub and add instructions are okay to outline; all other SP modifications
8290     // are not, as illustrated below.
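         // Illustrative (not from a real candidate): a balanced pair such as
         //   sub sp, sp, #16  ...  add sp, sp, #16
         // nets SPValue == 0 and is acceptable, whereas an unmatched
         //   add sp, sp, #32
         // leaves SPValue != 0 and the candidate is removed below.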
8291     auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
8292       int SPValue = 0;
8293       MachineBasicBlock::iterator MBBI = C.front();
8294       for (;;) {
8295         if (MBBI->modifiesRegister(AArch64::SP, &TRI)) {
8296           switch (MBBI->getOpcode()) {
8297           case AArch64::ADDXri:
8298           case AArch64::ADDWri:
8299             assert(MBBI->getNumOperands() == 4 && "Wrong number of operands");
8300             assert(MBBI->getOperand(2).isImm() &&
8301                    "Expected operand to be immediate");
8302             assert(MBBI->getOperand(1).isReg() &&
8303                    "Expected operand to be a register");
8304             // Check if the add just increments sp. If so, we search for
8305             // matching sub instructions that decrement sp. If not, the
8306             // modification is illegal
8307             if (MBBI->getOperand(1).getReg() == AArch64::SP)
8308               SPValue += MBBI->getOperand(2).getImm();
8309             else
8310               return true;
8311             break;
8312           case AArch64::SUBXri:
8313           case AArch64::SUBWri:
8314             assert(MBBI->getNumOperands() == 4 && "Wrong number of operands");
8315             assert(MBBI->getOperand(2).isImm() &&
8316                    "Expected operand to be immediate");
8317             assert(MBBI->getOperand(1).isReg() &&
8318                    "Expected operand to be a register");
8319             // Check if the sub just decrements sp. If so, we search for
8320             // matching add instructions that increment sp. If not, the
8321             // modification is illegal
8322             if (MBBI->getOperand(1).getReg() == AArch64::SP)
8323               SPValue -= MBBI->getOperand(2).getImm();
8324             else
8325               return true;
8326             break;
8327           default:
8328             return true;
8329           }
8330         }
8331         if (MBBI == C.back())
8332           break;
8333         ++MBBI;
8334       }
8335       if (SPValue)
8336         return true;
8337       return false;
8338     };
8339     // Remove candidates with illegal stack modifying instructions
8340     llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
8341 
8342     // If the sequence doesn't have enough candidates left, then we're done.
8343     if (RepeatedSequenceLocs.size() < 2)
8344       return std::nullopt;
8345   }
8346 
8347   // Properties about candidate MBBs that hold for all of them.
8348   unsigned FlagsSetInAll = 0xF;
8349 
8350   // Compute liveness information for each candidate, and set FlagsSetInAll.
8351   for (outliner::Candidate &C : RepeatedSequenceLocs)
8352     FlagsSetInAll &= C.Flags;
8353 
8354   unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode();
8355 
8356   // Helper lambda which sets call information for every candidate.
8357   auto SetCandidateCallInfo =
8358       [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
8359         for (outliner::Candidate &C : RepeatedSequenceLocs)
8360           C.setCallInfo(CallID, NumBytesForCall);
8361       };
8362 
8363   unsigned FrameID = MachineOutlinerDefault;
8364   NumBytesToCreateFrame += 4;
8365 
8366   bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
8367     return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
8368   });
8369 
8370   // We check to see if CFI instructions are present, and if they are,
8371   // we find the number of CFI instructions in the candidates.
8372   unsigned CFICount = 0;
8373   for (auto &I : make_range(RepeatedSequenceLocs[0].front(),
8374                             std::next(RepeatedSequenceLocs[0].back()))) {
8375     if (I.isCFIInstruction())
8376       CFICount++;
8377   }
8378 
8379   // We compare the number of found CFI instructions to the number of CFI
8380   // instructions in the parent function for each candidate. We must check this
8381   // since if we outline one of the CFI instructions in a function, we have to
8382   // outline them all for correctness. If we do not, the address offsets will be
8383   // incorrect between the two sections of the program.
8384   for (outliner::Candidate &C : RepeatedSequenceLocs) {
8385     std::vector<MCCFIInstruction> CFIInstructions =
8386         C.getMF()->getFrameInstructions();
8387 
8388     if (CFICount > 0 && CFICount != CFIInstructions.size())
8389       return std::nullopt;
8390   }
8391 
8392   // Returns true if an instruction is safe to fix up, false otherwise.
8393   auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
8394     if (MI.isCall())
8395       return true;
8396 
8397     if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
8398         !MI.readsRegister(AArch64::SP, &TRI))
8399       return true;
8400 
8401     // Any modification of SP will break our code to save/restore LR.
8402     // FIXME: We could handle some instructions which add a constant
8403     // offset to SP, with a bit more work.
8404     if (MI.modifiesRegister(AArch64::SP, &TRI))
8405       return false;
8406 
8407     // At this point, we have a stack instruction that we might need to
8408     // fix up. We'll handle it if it's a load or store.
8409     if (MI.mayLoadOrStore()) {
8410       const MachineOperand *Base; // Filled with the base operand of MI.
8411       int64_t Offset;             // Filled with the offset of MI.
8412       bool OffsetIsScalable;
8413 
8414       // Does it allow us to offset the base operand and is the base the
8415       // register SP?
8416       if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
8417           !Base->isReg() || Base->getReg() != AArch64::SP)
8418         return false;
8419 
8420       // Fix-up code below assumes byte offsets.
8421       if (OffsetIsScalable)
8422         return false;
8423 
8424       // Find the minimum/maximum offset for this instruction and check
8425       // if fixing it up would be in range.
8426       // Unscaled offsets for the instruction, and the scale to multiply
8427       // them by.
8428       int64_t MinOffset, MaxOffset;
8429       TypeSize Scale(0U, false), DummyWidth(0U, false);
8430       getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
8431 
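           // Outlining with a default frame spills LR to the stack, which moves
           // SP by 16 bytes (keeping 16-byte alignment), so every SP-relative
           // offset inside the outlined sequence shifts by 16.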
8432       Offset += 16; // Update the offset to what it would be if we outlined.
8433       if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
8434           Offset > MaxOffset * (int64_t)Scale.getFixedValue())
8435         return false;
8436 
8437       // It's in range, so we can outline it.
8438       return true;
8439     }
8440 
8441     // FIXME: Add handling for instructions like "add x0, sp, #8".
8442 
8443     // We can't fix it up, so don't outline it.
8444     return false;
8445   };
8446 
8447   // True if it's possible to fix up each stack instruction in this sequence.
8448   // Important for frames/call variants that modify the stack.
8449   bool AllStackInstrsSafe = std::all_of(
8450       FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup);
8451 
8452   // If the last instruction in any candidate is a terminator, then we should
8453   // tail call all of the candidates.
8454   if (RepeatedSequenceLocs[0].back()->isTerminator()) {
8455     FrameID = MachineOutlinerTailCall;
8456     NumBytesToCreateFrame = 0;
8457     unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
8458     SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
8459   }
8460 
8461   else if (LastInstrOpcode == AArch64::BL ||
8462            ((LastInstrOpcode == AArch64::BLR ||
8463              LastInstrOpcode == AArch64::BLRNoIP) &&
8464             !HasBTI)) {
8465     // FIXME: Do we need to check if the code after this uses the value of LR?
8466     FrameID = MachineOutlinerThunk;
8467     NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
8468     SetCandidateCallInfo(MachineOutlinerThunk, 4);
8469   }
8470 
8471   else {
8472     // We need to decide how to emit calls + frames. We can always emit the same
8473     // frame if we don't need to save to the stack. If we have to save to the
8474     // stack, then we need a different frame.
8475     unsigned NumBytesNoStackCalls = 0;
8476     std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
8477 
8478     // Check if we have to save LR.
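         // The byte counts below follow the overheads documented above: 4 bytes
         // for a bare BL (MachineOutlinerNoLRSave) and 12 bytes for a
         // three-instruction save + BL + restore (MachineOutlinerRegSave /
         // MachineOutlinerDefault).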
8479     for (outliner::Candidate &C : RepeatedSequenceLocs) {
8480       bool LRAvailable =
8481           (C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere)
8482               ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)
8483               : true;
8484       // If we have a noreturn caller, then we're going to be conservative and
8485       // say that we have to save LR. If we don't have a ret at the end of the
8486       // block, then we can't reason about liveness accurately.
8487       //
8488       // FIXME: We can probably do better than always disabling this in
8489       // noreturn functions by fixing up the liveness info.
8490       bool IsNoReturn =
8491           C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
8492 
8493       // Is LR available? If so, we don't need a save.
8494       if (LRAvailable && !IsNoReturn) {
8495         NumBytesNoStackCalls += 4;
8496         C.setCallInfo(MachineOutlinerNoLRSave, 4);
8497         CandidatesWithoutStackFixups.push_back(C);
8498       }
8499 
8500       // Is an unused register available? If so, we won't modify the stack, so
8501       // we can outline with the same frame type as those that don't save LR.
8502       else if (findRegisterToSaveLRTo(C)) {
8503         NumBytesNoStackCalls += 12;
8504         C.setCallInfo(MachineOutlinerRegSave, 12);
8505         CandidatesWithoutStackFixups.push_back(C);
8506       }
8507 
8508       // Is SP used in the sequence at all? If not, we don't have to modify
8509       // the stack, so we are guaranteed to get the same frame.
8510       else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
8511         NumBytesNoStackCalls += 12;
8512         C.setCallInfo(MachineOutlinerDefault, 12);
8513         CandidatesWithoutStackFixups.push_back(C);
8514       }
8515 
8516       // If we outline this, we need to modify the stack. Pretend we don't
8517       // outline this by saving all of its bytes.
8518       else {
8519         NumBytesNoStackCalls += SequenceSize;
8520       }
8521     }
8522 
8523     // If there are no places where we have to save LR, then note that we
8524     // don't have to update the stack. Otherwise, give every candidate the
8525     // default call type, as long as it's safe to do so.
8526     if (!AllStackInstrsSafe ||
8527         NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
8528       RepeatedSequenceLocs = CandidatesWithoutStackFixups;
8529       FrameID = MachineOutlinerNoLRSave;
8530     } else {
8531       SetCandidateCallInfo(MachineOutlinerDefault, 12);
8532 
8533       // Bugzilla ID: 46767
8534       // TODO: Check if fixing up the stack more than once is safe so we can
8535       // outline these.
8536       //
8537       // An outline resulting in a caller that requires stack fixups at the
8538       // callsite to a callee that also requires stack fixups can happen when
8539       // there are no available registers at the candidate callsite for a
8540       // candidate that itself also has calls.
8541       //
8542       // In other words, if function_containing_sequence in the following pseudo
8543       // assembly requires that we save LR at the point of the call, but there
8544       // are no available registers, then we save using SP and, as a result, the
8545       // SP offsets require stack fixups by multiples of 16.
8546       //
8547       // function_containing_sequence:
8548       //   ...
8549       //   save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
8550       //   call OUTLINED_FUNCTION_N
8551       //   restore LR from SP
8552       //   ...
8553       //
8554       // OUTLINED_FUNCTION_N:
8555       //   save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
8556       //   ...
8557       //   bl foo
8558       //   restore LR from SP
8559       //   ret
8560       //
8561       // Because the code to handle more than one stack fixup does not
8562       // currently have the proper checks for legality, these cases will assert
8563       // in the AArch64 MachineOutliner. This is because the code to do this
8564       // needs more hardening, testing, better checks that generated code is
8565       // legal, etc., and because it is only verified to handle a single pass of
8566       // stack fixup.
8567       //
8568       // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
8569       // these cases until they are known to be handled. Bugzilla 46767 is
8570       // referenced in comments at the assert site.
8571       //
8572       // To avoid asserting (or generating non-legal code on noassert builds)
8573       // To avoid asserting (or generating non-legal code on noassert builds),
8574       // we remove all candidates that would need more than one stack fixup by
8575       // pruning the cases where the candidate has calls while also having no
8576       // available LR and no available general purpose register to copy LR to
8577       // (i.e. one extra stack save/restore).
8578       if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
8579         erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
8580           return (std::any_of(
8581                      C.front(), std::next(C.back()),
8582                      [](const MachineInstr &MI) { return MI.isCall(); })) &&
8583                  (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
8584                   !findRegisterToSaveLRTo(C));
8585         });
8586       }
8587     }
8588 
8589     // If we dropped all of the candidates, bail out here.
8590     if (RepeatedSequenceLocs.size() < 2) {
8591       RepeatedSequenceLocs.clear();
8592       return std::nullopt;
8593     }
8594   }
8595 
8596   // Does every candidate's MBB contain a call? If so, then we might have a call
8597   // in the range.
8598   if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
8599     // Check if the range contains a call. These require a save + restore of the
8600     // link register.
8601     bool ModStackToSaveLR = false;
8602     if (std::any_of(FirstCand.front(), FirstCand.back(),
8603                     [](const MachineInstr &MI) { return MI.isCall(); }))
8604       ModStackToSaveLR = true;
8605 
8606     // Handle the last instruction separately. If this is a tail call, then the
8607     // last instruction is a call. We don't want to save + restore in this case.
8608     // However, it could be possible that the last instruction is a call without
8609       // However, it is possible for the last instruction to be a call without
8610     // well.
8611     else if (FrameID != MachineOutlinerThunk &&
8612              FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall())
8613       ModStackToSaveLR = true;
8614 
8615     if (ModStackToSaveLR) {
8616       // We can't fix up the stack. Bail out.
8617       if (!AllStackInstrsSafe) {
8618         RepeatedSequenceLocs.clear();
8619         return std::nullopt;
8620       }
8621 
8622       // Save + restore LR.
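           // (one STRXpre and one LDRXpost in the outlined frame, 4 bytes each).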
8623       NumBytesToCreateFrame += 8;
8624     }
8625   }
8626 
8627   // If we have CFI instructions, we can only outline if the outlined section
8628   // can be a tail call.
8629   if (FrameID != MachineOutlinerTailCall && CFICount > 0)
8630     return std::nullopt;
8631 
8632   return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
8633                                     NumBytesToCreateFrame, FrameID);
8634 }
8635 
8636 void AArch64InstrInfo::mergeOutliningCandidateAttributes(
8637     Function &F, std::vector<outliner::Candidate> &Candidates) const {
8638   // If a bunch of candidates reach this point, they must agree on their return
8639   // address signing. It is therefore enough to just consider the signing
8640   // behaviour of one of them.
8641   const auto &CFn = Candidates.front().getMF()->getFunction();
8642 
8643   // Since all candidates belong to the same module, just copy the
8644   // function-level attributes of an arbitrary function.
8645   if (CFn.hasFnAttribute("sign-return-address"))
8646     F.addFnAttr(CFn.getFnAttribute("sign-return-address"));
8647   if (CFn.hasFnAttribute("sign-return-address-key"))
8648     F.addFnAttr(CFn.getFnAttribute("sign-return-address-key"));
8649 
8650   AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
8651 }
8652 
8653 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
8654     MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
8655   const Function &F = MF.getFunction();
8656 
8657   // Can F be deduplicated by the linker? If it can, don't outline from it.
8658   if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
8659     return false;
8660 
8661   // Don't outline from functions with section markings; the program could
8662   // expect that all the code is in the named section.
8663   // FIXME: Allow outlining from multiple functions with the same section
8664   // marking.
8665   if (F.hasSection())
8666     return false;
8667 
8668   // Outlining from functions with redzones is unsafe since the outliner may
8669   // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
8670   // outline from it.
8671   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
8672   if (!AFI || AFI->hasRedZone().value_or(true))
8673     return false;
8674 
8675   // FIXME: Teach the outliner to generate/handle Windows unwind info.
8676   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
8677     return false;
8678 
8679   // It's safe to outline from MF.
8680   return true;
8681 }
8682 
8683 SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
8684 AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
8685                                       unsigned &Flags) const {
8686   assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
8687          "Must track liveness!");
8688   SmallVector<
8689       std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
8690       Ranges;
8691   // According to the AArch64 Procedure Call Standard, the following are
8692   // undefined on entry/exit from a function call:
8693   //
8694   // * Registers x16, x17, (and thus w16, w17)
8695   // * Condition codes (and thus the NZCV register)
8696   //
8697   // If any of these registers are used inside or live across an outlined
8698   // function, then they may be modified later, either by the compiler or
8699   // some other tool (like the linker).
8700   //
8701   // To avoid outlining in these situations, partition each block into ranges
8702   // where these registers are dead. We will only outline from those ranges.
8703   LiveRegUnits LRU(getRegisterInfo());
8704   auto AreAllUnsafeRegsDead = [&LRU]() {
8705     return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&
8706            LRU.available(AArch64::NZCV);
8707   };
8708 
8709   // We need to know if LR is live across an outlining boundary later on in
8710   // order to decide how we'll create the outlined call, frame, etc.
8711   //
8712   // It's pretty expensive to check this for *every candidate* within a block.
8713   // That's some potentially n^2 behaviour, since in the worst case, we'd need
8714   // to compute liveness from the end of the block for O(n) candidates within
8715   // the block.
8716   //
8717   // So, to improve the average case, let's keep track of liveness from the end
8718   // of the block to the beginning of *every outlinable range*. If we know that
8719   // LR is available in every range we could outline from, then we know that
8720   // we don't need to check liveness for any candidate within that range.
8721   bool LRAvailableEverywhere = true;
8722   // Compute liveness bottom-up.
8723   LRU.addLiveOuts(MBB);
8724   // Update flags that require info about the entire MBB.
8725   auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
8726     if (MI.isCall() && !MI.isTerminator())
8727       Flags |= MachineOutlinerMBBFlags::HasCalls;
8728   };
8729   // Range: [RangeBegin, RangeEnd)
8730   MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
8731   unsigned RangeLen;
8732   auto CreateNewRangeStartingAt =
8733       [&RangeBegin, &RangeEnd,
8734        &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
8735         RangeBegin = NewBegin;
8736         RangeEnd = std::next(RangeBegin);
8737         RangeLen = 0;
8738       };
8739   auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
8740     // At least one unsafe register is not dead. We do not want to outline at
8741     // this point. If it is long enough to outline from, save the range
8742     // [RangeBegin, RangeEnd).
8743     if (RangeLen > 1)
8744       Ranges.push_back(std::make_pair(RangeBegin, RangeEnd));
8745   };
8746   // Find the first point where all unsafe registers are dead.
8747   // FIND: <safe instr> <-- end of first potential range
8748   // SKIP: <unsafe def>
8749   // SKIP: ... everything between ...
8750   // SKIP: <unsafe use>
8751   auto FirstPossibleEndPt = MBB.instr_rbegin();
8752   for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
8753     LRU.stepBackward(*FirstPossibleEndPt);
8754     // Update flags that impact how we outline across the entire block,
8755     // regardless of safety.
8756     UpdateWholeMBBFlags(*FirstPossibleEndPt);
8757     if (AreAllUnsafeRegsDead())
8758       break;
8759   }
8760   // If we exhausted the entire block, we have no safe ranges to outline.
8761   if (FirstPossibleEndPt == MBB.instr_rend())
8762     return Ranges;
8763   // Current range.
8764   CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
8765   // FirstPossibleEndPt points to the first place where all unsafe registers
8766   // are dead (if there is any such point). Begin partitioning the MBB into
8767   // ranges.
8768   for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) {
8769     LRU.stepBackward(MI);
8770     UpdateWholeMBBFlags(MI);
8771     if (!AreAllUnsafeRegsDead()) {
8772       SaveRangeIfNonEmpty();
8773       CreateNewRangeStartingAt(MI.getIterator());
8774       continue;
8775     }
8776     LRAvailableEverywhere &= LRU.available(AArch64::LR);
8777     RangeBegin = MI.getIterator();
8778     ++RangeLen;
8779   }
8780   // The above loop misses the last (or only) range. If we are still safe,
8781   // save the range.
8782   if (AreAllUnsafeRegsDead())
8783     SaveRangeIfNonEmpty();
8784   if (Ranges.empty())
8785     return Ranges;
8786   // We found the ranges bottom-up, but the mapping expects them top-down.
8787   // Reverse the order.
8788   std::reverse(Ranges.begin(), Ranges.end());
8789   // If there is at least one outlinable range where LR is unavailable
8790   // somewhere, remember that.
8791   if (!LRAvailableEverywhere)
8792     Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
8793   return Ranges;
8794 }
8795 
8796 outliner::InstrType
8797 AArch64InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT,
8798                                    unsigned Flags) const {
8799   MachineInstr &MI = *MIT;
8800   MachineBasicBlock *MBB = MI.getParent();
8801   MachineFunction *MF = MBB->getParent();
8802   AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
8803 
8804   // Don't outline anything used for return address signing. The outlined
8805   // function will get signed later if needed.
8806   switch (MI.getOpcode()) {
8807   case AArch64::PACM:
8808   case AArch64::PACIASP:
8809   case AArch64::PACIBSP:
8810   case AArch64::PACIASPPC:
8811   case AArch64::PACIBSPPC:
8812   case AArch64::AUTIASP:
8813   case AArch64::AUTIBSP:
8814   case AArch64::AUTIASPPCi:
8815   case AArch64::AUTIASPPCr:
8816   case AArch64::AUTIBSPPCi:
8817   case AArch64::AUTIBSPPCr:
8818   case AArch64::RETAA:
8819   case AArch64::RETAB:
8820   case AArch64::RETAASPPCi:
8821   case AArch64::RETAASPPCr:
8822   case AArch64::RETABSPPCi:
8823   case AArch64::RETABSPPCr:
8824   case AArch64::EMITBKEY:
8825   case AArch64::PAUTH_PROLOGUE:
8826   case AArch64::PAUTH_EPILOGUE:
8827     return outliner::InstrType::Illegal;
8828   }
8829 
8830   // Don't outline LOHs.
8831   if (FuncInfo->getLOHRelated().count(&MI))
8832     return outliner::InstrType::Illegal;
8833 
8834   // We can only outline these if we will tail call the outlined function, or
8835   // fix up the CFI offsets. Currently, CFI instructions are outlined only when
8836   // the outlined sequence is tail-called.
8837   //
8838   // FIXME: If the proper fixups for the offset are implemented, this should be
8839   // possible.
8840   if (MI.isCFIInstruction())
8841     return outliner::InstrType::Legal;
8842 
8843   // Is this a terminator for a basic block?
8844   if (MI.isTerminator())
8845     // TargetInstrInfo::getOutliningType has already filtered out anything
8846     // that would break this, so we can allow it here.
8847     return outliner::InstrType::Legal;
8848 
8849   // Make sure none of the operands are un-outlinable.
8850   for (const MachineOperand &MOP : MI.operands()) {
8851     // A check preventing CFI indices was here before, but only CFI
8852     // instructions should have those.
8853     assert(!MOP.isCFIIndex());
8854 
8855     // If it uses LR or W30 explicitly, then don't touch it.
8856     if (MOP.isReg() && !MOP.isImplicit() &&
8857         (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
8858       return outliner::InstrType::Illegal;
8859   }
8860 
8861   // Special cases for instructions that can always be outlined, but will fail
8862   // the later tests. E.g. ADRPs, which are PC-relative, use LR but can always
8863   // be outlined because they don't require a *specific* value to be in LR.
8864   if (MI.getOpcode() == AArch64::ADRP)
8865     return outliner::InstrType::Legal;
8866 
8867   // If MI is a call we might be able to outline it. We don't want to outline
8868   // any calls that rely on the position of items on the stack. When we outline
8869   // something containing a call, we have to emit a save and restore of LR in
8870   // the outlined function. Currently, this always happens by saving LR to the
8871   // stack. Thus, if we outline, say, half the parameters for a function call
8872   // plus the call, then we'll break the callee's expectations for the layout
8873   // of the stack.
8874   //
8875   // FIXME: Allow calls to functions which construct a stack frame, as long
8876   // as they don't access arguments on the stack.
8877   // FIXME: Figure out some way to analyze functions defined in other modules.
8878   // We should be able to compute the memory usage based on the IR calling
8879   // convention, even if we can't see the definition.
8880   if (MI.isCall()) {
8881     // Get the function associated with the call. Look at each operand and find
8882     // the one that represents the callee and get its name.
8883     const Function *Callee = nullptr;
8884     for (const MachineOperand &MOP : MI.operands()) {
8885       if (MOP.isGlobal()) {
8886         Callee = dyn_cast<Function>(MOP.getGlobal());
8887         break;
8888       }
8889     }
8890 
8891     // Never outline calls to mcount.  There isn't any rule that would require
8892     // this, but the Linux kernel's "ftrace" feature depends on it.
8893     if (Callee && Callee->getName() == "\01_mcount")
8894       return outliner::InstrType::Illegal;
8895 
8896     // If we don't know anything about the callee, assume it depends on the
8897     // stack layout of the caller. In that case, it's only legal to outline
8898     // as a tail-call. Explicitly list the call instructions we know about so we
8899     // don't get unexpected results with call pseudo-instructions.
8900     auto UnknownCallOutlineType = outliner::InstrType::Illegal;
8901     if (MI.getOpcode() == AArch64::BLR ||
8902         MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
8903       UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
8904 
8905     if (!Callee)
8906       return UnknownCallOutlineType;
8907 
8908     // We have a function we have information about. Check if it's something we
8909     // can safely outline.
8910     MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
8911 
8912     // We don't know what's going on with the callee at all. Don't touch it.
8913     if (!CalleeMF)
8914       return UnknownCallOutlineType;
8915 
8916     // Check if we know anything about the callee saves on the function. If we
8917     // don't, then don't touch it, since that implies that we haven't
8918     // computed anything about its stack frame yet.
8919     MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
8920     if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
8921         MFI.getNumObjects() > 0)
8922       return UnknownCallOutlineType;
8923 
8924     // At this point, we can say that CalleeMF ought to not pass anything on the
8925     // stack. Therefore, we can outline it.
8926     return outliner::InstrType::Legal;
8927   }
8928 
8929   // Don't touch the link register or W30.
8930   if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
8931       MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
8932     return outliner::InstrType::Illegal;
8933 
8934   // Don't outline BTI instructions, because that will prevent the outlining
8935   // site from being indirectly callable.
8936   if (hasBTISemantics(MI))
8937     return outliner::InstrType::Illegal;
8938 
8939   return outliner::InstrType::Legal;
8940 }
8941 
8942 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
8943   for (MachineInstr &MI : MBB) {
8944     const MachineOperand *Base;
8945     TypeSize Width(0, false);
8946     int64_t Offset;
8947     bool OffsetIsScalable;
8948 
8949     // Is this a load or store with an immediate offset with SP as the base?
8950     if (!MI.mayLoadOrStore() ||
8951         !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
8952                                       &RI) ||
8953         (Base->isReg() && Base->getReg() != AArch64::SP))
8954       continue;
8955 
8956     // It is, so we have to fix it up.
8957     TypeSize Scale(0U, false);
8958     int64_t Dummy1, Dummy2;
8959 
8960     MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
8961     assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
8962     getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
8963     assert(Scale != 0 && "Unexpected opcode!");
8964     assert(!OffsetIsScalable && "Expected offset to be a byte offset");
8965 
8966     // We've pushed the return address to the stack, so add 16 to the offset.
8967     // This is safe, since we already checked if it would overflow when we
8968     // checked if this instruction was legal to outline.
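         // For example, an 'ldr x0, [sp, #8]' from the original sequence becomes
         // 'ldr x0, [sp, #24]' in the outlined body; with Scale == 8 the encoded
         // immediate goes from 1 to 3.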
8969     int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
8970     StackOffsetOperand.setImm(NewImm);
8971   }
8972 }
8973 
8974 static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
8975                                  const AArch64InstrInfo *TII,
8976                                  bool ShouldSignReturnAddr) {
8977   if (!ShouldSignReturnAddr)
8978     return;
8979 
8980   BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE))
8981       .setMIFlag(MachineInstr::FrameSetup);
8982   BuildMI(MBB, MBB.getFirstInstrTerminator(), DebugLoc(),
8983           TII->get(AArch64::PAUTH_EPILOGUE))
8984       .setMIFlag(MachineInstr::FrameDestroy);
8985 }
8986 
8987 void AArch64InstrInfo::buildOutlinedFrame(
8988     MachineBasicBlock &MBB, MachineFunction &MF,
8989     const outliner::OutlinedFunction &OF) const {
8990 
8991   AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
8992 
8993   if (OF.FrameConstructionID == MachineOutlinerTailCall)
8994     FI->setOutliningStyle("Tail Call");
8995   else if (OF.FrameConstructionID == MachineOutlinerThunk) {
8996     // For thunk outlining, rewrite the last instruction from a call to a
8997     // tail-call.
8998     MachineInstr *Call = &*--MBB.instr_end();
8999     unsigned TailOpcode;
9000     if (Call->getOpcode() == AArch64::BL) {
9001       TailOpcode = AArch64::TCRETURNdi;
9002     } else {
9003       assert(Call->getOpcode() == AArch64::BLR ||
9004              Call->getOpcode() == AArch64::BLRNoIP);
9005       TailOpcode = AArch64::TCRETURNriALL;
9006     }
9007     MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
9008                            .add(Call->getOperand(0))
9009                            .addImm(0);
9010     MBB.insert(MBB.end(), TC);
9011     Call->eraseFromParent();
9012 
9013     FI->setOutliningStyle("Thunk");
9014   }
9015 
9016   bool IsLeafFunction = true;
9017 
9018   // Is there a call in the outlined range?
9019   auto IsNonTailCall = [](const MachineInstr &MI) {
9020     return MI.isCall() && !MI.isReturn();
9021   };
9022 
9023   if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
9024     // Fix up the instructions in the range, since we're going to modify the
9025     // stack.
9026 
9027     // Bugzilla ID: 46767
9028     // TODO: Check if fixing up twice is safe so we can outline these.
9029     assert(OF.FrameConstructionID != MachineOutlinerDefault &&
9030            "Can only fix up stack references once");
9031     fixupPostOutline(MBB);
9032 
9033     IsLeafFunction = false;
9034 
9035     // LR has to be a live in so that we can save it.
9036     if (!MBB.isLiveIn(AArch64::LR))
9037       MBB.addLiveIn(AArch64::LR);
9038 
9039     MachineBasicBlock::iterator It = MBB.begin();
9040     MachineBasicBlock::iterator Et = MBB.end();
9041 
9042     if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9043         OF.FrameConstructionID == MachineOutlinerThunk)
9044       Et = std::prev(MBB.end());
9045 
9046     // Insert a save before the outlined region
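         // (an 'str x30, [sp, #-16]!', i.e. a pre-indexed store that also bumps SP).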
9047     MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
9048                                 .addReg(AArch64::SP, RegState::Define)
9049                                 .addReg(AArch64::LR)
9050                                 .addReg(AArch64::SP)
9051                                 .addImm(-16);
9052     It = MBB.insert(It, STRXpre);
9053 
9054     if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
9055       const TargetSubtargetInfo &STI = MF.getSubtarget();
9056       const MCRegisterInfo *MRI = STI.getRegisterInfo();
9057       unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
9058 
9059       // Add a CFI saying the stack was moved 16 B down.
9060       int64_t StackPosEntry =
9061           MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));
9062       BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
9063           .addCFIIndex(StackPosEntry)
9064           .setMIFlags(MachineInstr::FrameSetup);
9065 
9066       // Add a CFI saying that the saved LR can now be found 16 B below the
9067       // CFA, i.e. at the new SP.
9068       int64_t LRPosEntry = MF.addFrameInst(
9069           MCCFIInstruction::createOffset(nullptr, DwarfReg, -16));
9070       BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
9071           .addCFIIndex(LRPosEntry)
9072           .setMIFlags(MachineInstr::FrameSetup);
9073     }
9074 
9075     // Insert a restore before the terminator for the function.
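         // (an 'ldr x30, [sp], #16', undoing the pre-indexed store above).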
9076     MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
9077                                  .addReg(AArch64::SP, RegState::Define)
9078                                  .addReg(AArch64::LR, RegState::Define)
9079                                  .addReg(AArch64::SP)
9080                                  .addImm(16);
9081     Et = MBB.insert(Et, LDRXpost);
9082   }
9083 
9084   bool ShouldSignReturnAddr = FI->shouldSignReturnAddress(!IsLeafFunction);
9085 
9086   // If this is a tail call outlined function, then there's already a return.
9087   if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9088       OF.FrameConstructionID == MachineOutlinerThunk) {
9089     signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
9090     return;
9091   }
9092 
9093   // It's not a tail call, so we have to insert the return ourselves.
9094 
9095   // LR has to be a live in so that we can return to it.
9096   if (!MBB.isLiveIn(AArch64::LR))
9097     MBB.addLiveIn(AArch64::LR);
9098 
9099   MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
9100                           .addReg(AArch64::LR);
9101   MBB.insert(MBB.end(), ret);
9102 
9103   signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
9104 
9105   FI->setOutliningStyle("Function");
9106 
9107   // Did we have to modify the stack by saving the link register?
9108   if (OF.FrameConstructionID != MachineOutlinerDefault)
9109     return;
9110 
9111   // We modified the stack.
9112   // Walk over the basic block and fix up all the stack accesses.
9113   fixupPostOutline(MBB);
9114 }
9115 
9116 MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
9117     Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
9118     MachineFunction &MF, outliner::Candidate &C) const {
9119 
9120   // Are we tail calling?
9121   if (C.CallConstructionID == MachineOutlinerTailCall) {
9122     // If yes, then we can just branch to the label.
9123     It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
9124                             .addGlobalAddress(M.getNamedValue(MF.getName()))
9125                             .addImm(0));
9126     return It;
9127   }
9128 
9129   // Are we saving the link register?
9130   if (C.CallConstructionID == MachineOutlinerNoLRSave ||
9131       C.CallConstructionID == MachineOutlinerThunk) {
9132     // No, so just insert the call.
9133     It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
9134                             .addGlobalAddress(M.getNamedValue(MF.getName())));
9135     return It;
9136   }
9137 
9138   // We want to return the spot where we inserted the call.
9139   MachineBasicBlock::iterator CallPt;
9140 
9141   // Instructions for saving and restoring LR around the call instruction we're
9142   // going to insert.
9143   MachineInstr *Save;
9144   MachineInstr *Restore;
9145   // Can we save to a register?
9146   if (C.CallConstructionID == MachineOutlinerRegSave) {
9147     // FIXME: This logic should be sunk into a target-specific interface so that
9148     // we don't have to recompute the register.
9149     Register Reg = findRegisterToSaveLRTo(C);
9150     assert(Reg && "No callee-saved register available?");
9151 
9152     // LR has to be a live in so that we can save it.
9153     if (!MBB.isLiveIn(AArch64::LR))
9154       MBB.addLiveIn(AArch64::LR);
9155 
9156     // Save and restore LR from Reg.
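         // ORRXrs with XZR is the canonical register move, so this is
         // 'mov Reg, x30' before the call and 'mov x30, Reg' after it.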
9157     Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
9158                .addReg(AArch64::XZR)
9159                .addReg(AArch64::LR)
9160                .addImm(0);
9161     Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
9162                 .addReg(AArch64::XZR)
9163                 .addReg(Reg)
9164                 .addImm(0);
9165   } else {
9166     // We have the default case. Save and restore from SP.
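         // That is, 'str x30, [sp, #-16]!' before the call and 'ldr x30, [sp], #16'
         // after it.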
9167     Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
9168                .addReg(AArch64::SP, RegState::Define)
9169                .addReg(AArch64::LR)
9170                .addReg(AArch64::SP)
9171                .addImm(-16);
9172     Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
9173                   .addReg(AArch64::SP, RegState::Define)
9174                   .addReg(AArch64::LR, RegState::Define)
9175                   .addReg(AArch64::SP)
9176                   .addImm(16);
9177   }
9178 
9179   It = MBB.insert(It, Save);
9180   It++;
9181 
9182   // Insert the call.
9183   It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
9184                           .addGlobalAddress(M.getNamedValue(MF.getName())));
9185   CallPt = It;
9186   It++;
9187 
9188   It = MBB.insert(It, Restore);
9189   return CallPt;
9190 }
9191 
9192 bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
9193   MachineFunction &MF) const {
9194   return MF.getFunction().hasMinSize();
9195 }
9196 
9197 void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
9198                                           MachineBasicBlock::iterator Iter,
9199                                           DebugLoc &DL,
9200                                           bool AllowSideEffects) const {
9201   const MachineFunction &MF = *MBB.getParent();
9202   const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
9203   const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
9204 
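       // GPRs are cleared with 'mov Reg, #0' (MOVZ). With SVE available, scalable
       // registers are cleared with a zeroing DUP; otherwise fall back to a NEON
       // 'movi Reg.2d, #0'.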
9205   if (TRI.isGeneralPurposeRegister(MF, Reg)) {
9206     BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0);
9207   } else if (STI.hasSVE()) {
9208     BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg)
9209       .addImm(0)
9210       .addImm(0);
9211   } else {
9212     BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg)
9213       .addImm(0);
9214   }
9215 }
9216 
9217 std::optional<DestSourcePair>
9218 AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
9219 
9220   // AArch64::ORRWrs and AArch64::ORRXrs with a WZR/XZR source register and a
9221   // zero shift immediate are used as an alias for the mov instruction.
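       // For example, 'mov w0, w1' is encoded as 'orr w0, wzr, w1' (ORRWrs with a
       // zero shift).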
9222   if (MI.getOpcode() == AArch64::ORRWrs &&
9223       MI.getOperand(1).getReg() == AArch64::WZR &&
9224       MI.getOperand(3).getImm() == 0x0 &&
9225       // Check that the w->w move is not a zero-extending w->x mov.
9226       (!MI.getOperand(0).getReg().isVirtual() ||
9227        MI.getOperand(0).getSubReg() == 0) &&
9228       (!MI.getOperand(0).getReg().isPhysical() ||
9229        MI.findRegisterDefOperandIdx(MI.getOperand(0).getReg() - AArch64::W0 +
9230                                     AArch64::X0) == -1))
9231     return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
9232 
9233   if (MI.getOpcode() == AArch64::ORRXrs &&
9234       MI.getOperand(1).getReg() == AArch64::XZR &&
9235       MI.getOperand(3).getImm() == 0x0)
9236     return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
9237 
9238   return std::nullopt;
9239 }
9240 
9241 std::optional<DestSourcePair>
9242 AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const {
9243   if (MI.getOpcode() == AArch64::ORRWrs &&
9244       MI.getOperand(1).getReg() == AArch64::WZR &&
9245       MI.getOperand(3).getImm() == 0x0)
9246     return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
9247   return std::nullopt;
9248 }
9249 
9250 std::optional<RegImmPair>
9251 AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
9252   int Sign = 1;
9253   int64_t Offset = 0;
9254 
9255   // TODO: Handle cases where Reg is a super- or sub-register of the
9256   // destination register.
9257   const MachineOperand &Op0 = MI.getOperand(0);
9258   if (!Op0.isReg() || Reg != Op0.getReg())
9259     return std::nullopt;
9260 
9261   switch (MI.getOpcode()) {
9262   default:
9263     return std::nullopt;
9264   case AArch64::SUBWri:
9265   case AArch64::SUBXri:
9266   case AArch64::SUBSWri:
9267   case AArch64::SUBSXri:
9268     Sign *= -1;
9269     [[fallthrough]];
9270   case AArch64::ADDSWri:
9271   case AArch64::ADDSXri:
9272   case AArch64::ADDWri:
9273   case AArch64::ADDXri: {
9274     // TODO: Third operand can be global address (usually some string).
9275     if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
9276         !MI.getOperand(2).isImm())
9277       return std::nullopt;
9278     int Shift = MI.getOperand(3).getImm();
9279     assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
9280     Offset = Sign * (MI.getOperand(2).getImm() << Shift);
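         // For example, 'add x0, x1, #1, lsl #12' yields {x1, 4096}, and the SUB
         // forms above negate the offset, so 'sub x0, x1, #16' yields {x1, -16}.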
9281   }
9282   }
9283   return RegImmPair{MI.getOperand(1).getReg(), Offset};
9284 }
9285 
9286 /// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
9287 /// the destination register then, if possible, describe the value in terms of
9288 /// the source register.
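     /// For example, given 'mov x0, x1' (ORRXrs x0, xzr, x1), a request to describe
     /// w0 is answered with w1.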
9289 static std::optional<ParamLoadedValue>
9290 describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
9291                        const TargetInstrInfo *TII,
9292                        const TargetRegisterInfo *TRI) {
9293   auto DestSrc = TII->isCopyLikeInstr(MI);
9294   if (!DestSrc)
9295     return std::nullopt;
9296 
9297   Register DestReg = DestSrc->Destination->getReg();
9298   Register SrcReg = DestSrc->Source->getReg();
9299 
9300   auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
9301 
9302   // If the described register is the destination, just return the source.
9303   if (DestReg == DescribedReg)
9304     return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
9305 
9306   // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
9307   if (MI.getOpcode() == AArch64::ORRWrs &&
9308       TRI->isSuperRegister(DestReg, DescribedReg))
9309     return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
9310 
9311   // We may need to describe the lower part of an ORRXrs move.
9312   if (MI.getOpcode() == AArch64::ORRXrs &&
9313       TRI->isSubRegister(DestReg, DescribedReg)) {
9314     Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
9315     return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
9316   }
9317 
9318   assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
9319          "Unhandled ORR[XW]rs copy case");
9320 
9321   return std::nullopt;
9322 }
9323 
9324 bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
9325   // Functions cannot be split to different sections on AArch64 if they have
9326   // a red zone. This is because relaxing a cross-section branch may require
9327   // incrementing the stack pointer to spill a register, which would overwrite
9328   // the red zone.
9329   if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true))
9330     return false;
9331 
9332   return TargetInstrInfo::isFunctionSafeToSplit(MF);
9333 }
9334 
9335 bool AArch64InstrInfo::isMBBSafeToSplitToCold(
9336     const MachineBasicBlock &MBB) const {
9337   // Asm Goto blocks can contain conditional branches to goto labels, which can
9338   // get moved out of range of the branch instruction.
9339   auto isAsmGoto = [](const MachineInstr &MI) {
9340     return MI.getOpcode() == AArch64::INLINEASM_BR;
9341   };
9342   if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
9343     return false;
9344 
9345   // Because jump tables are label-relative instead of table-relative, they all
9346   // must be in the same section or relocation fixup handling will fail.
9347 
9348   // Check if MBB is a jump table target
9349   const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
9350   auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
9351     return llvm::is_contained(JTE.MBBs, &MBB);
9352   };
9353   if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB))
9354     return false;
9355 
9356   // Check if MBB contains a jump table lookup
9357   for (const MachineInstr &MI : MBB) {
9358     switch (MI.getOpcode()) {
9359     case TargetOpcode::G_BRJT:
9360     case AArch64::JumpTableDest32:
9361     case AArch64::JumpTableDest16:
9362     case AArch64::JumpTableDest8:
9363       return false;
9364     default:
9365       continue;
9366     }
9367   }
9368 
9369   // MBB isn't a special case, so it's safe to be split to the cold section.
9370   return true;
9371 }
9372 
9373 std::optional<ParamLoadedValue>
9374 AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
9375                                       Register Reg) const {
9376   const MachineFunction *MF = MI.getMF();
9377   const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
9378   switch (MI.getOpcode()) {
9379   case AArch64::MOVZWi:
9380   case AArch64::MOVZXi: {
9381     // MOVZWi may be used for producing zero-extended 32-bit immediates in
9382     // 64-bit parameters, so we need to consider super-registers.
9383     if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
9384       return std::nullopt;
9385 
9386     if (!MI.getOperand(1).isImm())
9387       return std::nullopt;
9388     int64_t Immediate = MI.getOperand(1).getImm();
9389     int Shift = MI.getOperand(2).getImm();
9390     return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
9391                             nullptr);
9392   }
9393   case AArch64::ORRWrs:
9394   case AArch64::ORRXrs:
9395     return describeORRLoadedValue(MI, Reg, this, TRI);
9396   }
9397 
9398   return TargetInstrInfo::describeLoadedValue(MI, Reg);
9399 }
9400 
9401 bool AArch64InstrInfo::isExtendLikelyToBeFolded(
9402     MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
9403   assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
9404          ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
9405          ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
9406 
9407   // Anyexts are nops.
9408   if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
9409     return true;
9410 
9411   Register DefReg = ExtMI.getOperand(0).getReg();
9412   if (!MRI.hasOneNonDBGUse(DefReg))
9413     return false;
9414 
9415   // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
9416   // addressing mode.
9417   auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
9418   return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
9419 }
9420 
9421 uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
9422   return get(Opc).TSFlags & AArch64::ElementSizeMask;
9423 }
9424 
9425 bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
9426   return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
9427 }
9428 
9429 bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
9430   return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
9431 }
9432 
9433 unsigned int
9434 AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
9435   return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
9436 }
9437 
9438 bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
9439                                              unsigned Scale) const {
9440   if (Offset && Scale)
9441     return false;
9442 
9443   // Check Reg + Imm
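       // e.g. the unscaled 'ldur x0, [x1, #-8]' form (9-bit signed), or
       // 'ldr x0, [x1, #32]' whose immediate is scaled by the access size
       // (12-bit unsigned).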
9444   if (!Scale) {
9445     // 9-bit signed offset
9446     if (isInt<9>(Offset))
9447       return true;
9448 
9449     // 12-bit unsigned offset
9450     unsigned Shift = Log2_64(NumBytes);
9451     if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
9452         // Must be a multiple of NumBytes (NumBytes is a power of 2)
9453         (Offset >> Shift) << Shift == Offset)
9454       return true;
9455     return false;
9456   }
9457 
9458   // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
9459   return Scale == 1 || (Scale > 0 && Scale == NumBytes);
9460 }
9461 
9462 unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
9463   if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
9464     return AArch64::BLRNoIP;
9465   else
9466     return AArch64::BLR;
9467 }
9468 
9469 bool AArch64InstrInfo::isReallyTriviallyReMaterializable(
9470     const MachineInstr &MI) const {
9471   const MachineFunction &MF = *MI.getMF();
9472   const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
9473 
9474   // If the function contains changes to streaming mode, then there
9475   // is a danger that rematerialised instructions end up between
9476   // instruction sequences (e.g. call sequences, or prolog/epilogue)
9477   // where the streaming-SVE mode is temporarily changed.
9478   if (AFI.hasStreamingModeChanges()) {
9479     // Avoid rematerializing rematerializable instructions that use/define
9480     // scalable values, such as 'pfalse' or 'ptrue', which produce different
9481     // results when the runtime vector length is different.
9482     const MachineRegisterInfo &MRI = MF.getRegInfo();
9483     const MachineFrameInfo &MFI = MF.getFrameInfo();
9484     if (any_of(MI.operands(), [&MRI, &MFI](const MachineOperand &MO) {
9485           if (MO.isFI() &&
9486               MFI.getStackID(MO.getIndex()) == TargetStackID::ScalableVector)
9487             return true;
9488           if (!MO.isReg())
9489             return false;
9490 
9491           if (MO.getReg().isVirtual()) {
9492             const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
9493             return AArch64::ZPRRegClass.hasSubClassEq(RC) ||
9494                    AArch64::PPRRegClass.hasSubClassEq(RC);
9495           }
9496           return AArch64::ZPRRegClass.contains(MO.getReg()) ||
9497                  AArch64::PPRRegClass.contains(MO.getReg());
9498         }))
9499       return false;
9500 
9501     // Avoid rematerializing instructions that return a value that is
9502     // different depending on vector length, even when it is not returned
9503     // in a scalable vector/predicate register.
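         // For example, 'cntb x0' returns the vector length in bytes, which can
         // differ between streaming and non-streaming mode.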
9504     switch (MI.getOpcode()) {
9505     default:
9506       break;
9507     case AArch64::RDVLI_XI:
9508     case AArch64::ADDVL_XXI:
9509     case AArch64::ADDPL_XXI:
9510     case AArch64::CNTB_XPiI:
9511     case AArch64::CNTH_XPiI:
9512     case AArch64::CNTW_XPiI:
9513     case AArch64::CNTD_XPiI:
9514       return false;
9515     }
9516   }
9517 
9518   return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
9519 }
9520 
9521 MachineBasicBlock::iterator
9522 AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
9523                                    Register TargetReg, bool FrameSetup) const {
9524   assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
9525 
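       // The emitted probing sequence looks like:
       //
       //   LoopTest:
       //     sub  sp, sp, #ProbeSize
       //     cmp  sp, TargetReg
       //     b.le LoopExit
       //   LoopBody:
       //     str  xzr, [sp]
       //     b    LoopTest
       //   LoopExit:
       //     mov  sp, TargetReg
       //     ldr  xzr, [sp]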
9526   MachineBasicBlock &MBB = *MBBI->getParent();
9527   MachineFunction &MF = *MBB.getParent();
9528   const AArch64InstrInfo *TII =
9529       MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
9530   int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
9531   DebugLoc DL = MBB.findDebugLoc(MBBI);
9532 
9533   MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
9534   MachineBasicBlock *LoopTestMBB =
9535       MF.CreateMachineBasicBlock(MBB.getBasicBlock());
9536   MF.insert(MBBInsertPoint, LoopTestMBB);
9537   MachineBasicBlock *LoopBodyMBB =
9538       MF.CreateMachineBasicBlock(MBB.getBasicBlock());
9539   MF.insert(MBBInsertPoint, LoopBodyMBB);
9540   MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
9541   MF.insert(MBBInsertPoint, ExitMBB);
9542   MachineInstr::MIFlag Flags =
9543       FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;
9544 
9545   // LoopTest:
9546   //   SUB SP, SP, #ProbeSize
9547   emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
9548                   AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);
9549 
9550   //   CMP SP, TargetReg
9551   BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
9552           AArch64::XZR)
9553       .addReg(AArch64::SP)
9554       .addReg(TargetReg)
9555       .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
9556       .setMIFlags(Flags);
9557 
9558   //   B.<Cond> LoopExit
9559   BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
9560       .addImm(AArch64CC::LE)
9561       .addMBB(ExitMBB)
9562       .setMIFlags(Flags);
9563 
9564   //   STR XZR, [SP]
9565   BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
9566       .addReg(AArch64::XZR)
9567       .addReg(AArch64::SP)
9568       .addImm(0)
9569       .setMIFlags(Flags);
9570 
9571   //   B loop
9572   BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
9573       .addMBB(LoopTestMBB)
9574       .setMIFlags(Flags);
9575 
9576   // LoopExit:
9577   //   MOV SP, TargetReg
9578   BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
9579       .addReg(TargetReg)
9580       .addImm(0)
9581       .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
9582       .setMIFlags(Flags);
9583 
9584   //   LDR XZR, [SP]
9585   BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))
9586       .addReg(AArch64::XZR, RegState::Define)
9587       .addReg(AArch64::SP)
9588       .addImm(0)
9589       .setMIFlags(Flags);
9590 
9591   ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
9592   ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
9593 
9594   LoopTestMBB->addSuccessor(ExitMBB);
9595   LoopTestMBB->addSuccessor(LoopBodyMBB);
9596   LoopBodyMBB->addSuccessor(LoopTestMBB);
9597   MBB.addSuccessor(LoopTestMBB);
9598 
9599   // Update liveins.
9600   if (MF.getRegInfo().reservedRegsFrozen()) {
9601     recomputeLiveIns(*LoopTestMBB);
9602     recomputeLiveIns(*LoopBodyMBB);
9603     recomputeLiveIns(*ExitMBB);
9604   }
9605 
9606   return ExitMBB->begin();
9607 }
9608 
9609 #define GET_INSTRINFO_HELPERS
9610 #define GET_INSTRMAP_INFO
9611 #include "AArch64GenInstrInfo.inc"
9612