//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64ExpandImm.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PointerAuth.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CFIInstBuilder.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>

using namespace llvm;

#define GET_INSTRINFO_CTOR_DTOR
#include "AArch64GenInstrInfo.inc"

static cl::opt<unsigned>
    CBDisplacementBits("aarch64-cb-offset-bits", cl::Hidden, cl::init(9),
                       cl::desc("Restrict range of CB instructions (DEBUG)"));

static cl::opt<unsigned> TBZDisplacementBits(
    "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
    cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned> CBZDisplacementBits(
    "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
    cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned>
    BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
                        cl::desc("Restrict range of Bcc instructions (DEBUG)"));

static cl::opt<unsigned>
    BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
                      cl::desc("Restrict range of B instructions (DEBUG)"));

AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
    : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
                          AArch64::CATCHRET),
      RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}

/// GetInstSize - Return the number of bytes of code the specified
/// instruction may be. This returns the maximum number of bytes.
unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  const MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction *MF = MBB.getParent();
  const Function &F = MF->getFunction();
  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();

  {
    auto Op = MI.getOpcode();
    if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
      return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
  }

  // Meta-instructions emit no code.
  if (MI.isMetaInstruction())
    return 0;

  // FIXME: We currently only handle pseudoinstructions that don't get expanded
  // before the assembly printer.
  unsigned NumBytes = 0;
  const MCInstrDesc &Desc = MI.getDesc();

  if (!MI.isBundle() && isTailCallReturnInst(MI)) {
    NumBytes = Desc.getSize() ? Desc.getSize() : 4;

    const auto *MFI = MF->getInfo<AArch64FunctionInfo>();
    if (!MFI->shouldSignReturnAddress(MF))
      return NumBytes;

    const auto &STI = MF->getSubtarget<AArch64Subtarget>();
    auto Method = STI.getAuthenticatedLRCheckMethod(*MF);
    NumBytes += AArch64PAuth::getCheckerSizeInBytes(Method);
    return NumBytes;
  }

  // The size should preferably be set in
  // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
  // The cases below handle instructions of variable size.
  switch (Desc.getOpcode()) {
  default:
    if (Desc.getSize())
      return Desc.getSize();

    // Anything not explicitly designated otherwise (i.e. pseudo-instructions
    // with fixed constant size but not specified in .td file) is a normal
    // 4-byte insn.
    NumBytes = 4;
    break;
  case TargetOpcode::STACKMAP:
    // The upper bound for a stackmap intrinsic is the full length of its
    // shadow.
    NumBytes = StackMapOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::PATCHPOINT:
    // The size of the patchpoint intrinsic is the number of bytes requested.
    NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::STATEPOINT:
    NumBytes = StatepointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    // No patch bytes means a normal call instruction is emitted.
    if (NumBytes == 0)
      NumBytes = 4;
    break;
  case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
    // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
    // instructions are expanded to the specified number of NOPs. Otherwise,
    // they are expanded to 36-byte XRay sleds.
    NumBytes =
        F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
    break;
  case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
  case TargetOpcode::PATCHABLE_TAIL_CALL:
  case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
    // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
    NumBytes = 36;
    break;
  case TargetOpcode::PATCHABLE_EVENT_CALL:
    // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
    NumBytes = 24;
    break;

  case AArch64::SPACE:
    NumBytes = MI.getOperand(1).getImm();
    break;
  case TargetOpcode::BUNDLE:
    NumBytes = getInstBundleLength(MI);
    break;
  }

  return NumBytes;
}

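/// Compute the size of a bundle by summing the sizes of every instruction
/// inside it; the BUNDLE header itself emits no code.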
unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
  unsigned Size = 0;
  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
  while (++I != E && I->isInsideBundle()) {
    assert(!I->isBundle() && "No nested bundle!");
    Size += getInstSizeInBytes(*I);
  }
  return Size;
}

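// Decode a conditional branch into a target block and a Cond vector that the
// rest of this file (reverseBranchCondition, instantiateCondBranch,
// insertSelect) understands:
//   Bcc:             { CondCode }
//   CB(N)Z[WX]:      { -1, Opcode, Reg }
//   TB(N)Z[WX]:      { -1, Opcode, Reg, BitImm }
//   CB[WX]P{ri,rr}:  { -1, Opcode, CC, Op0, Op1 }
// The leading -1 distinguishes the folded compare-and-branch forms from a
// plain Bcc condition code.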
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
                            SmallVectorImpl<MachineOperand> &Cond) {
  // Block ends with fall-through condbranch.
  switch (LastInst->getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    Target = LastInst->getOperand(2).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    Cond.push_back(LastInst->getOperand(1));
    break;
  case AArch64::CBWPri:
  case AArch64::CBXPri:
  case AArch64::CBWPrr:
  case AArch64::CBXPrr:
    Target = LastInst->getOperand(3).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    Cond.push_back(LastInst->getOperand(1));
    Cond.push_back(LastInst->getOperand(2));
    break;
  }
}

static unsigned getBranchDisplacementBits(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return BDisplacementBits;
  case AArch64::TBNZW:
  case AArch64::TBZW:
  case AArch64::TBNZX:
  case AArch64::TBZX:
    return TBZDisplacementBits;
  case AArch64::CBNZW:
  case AArch64::CBZW:
  case AArch64::CBNZX:
  case AArch64::CBZX:
    return CBZDisplacementBits;
  case AArch64::Bcc:
    return BCCDisplacementBits;
  case AArch64::CBWPri:
  case AArch64::CBXPri:
  case AArch64::CBWPrr:
  case AArch64::CBXPrr:
    return CBDisplacementBits;
  }
}

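// Branch instructions encode their displacement in units of 4-byte
// instructions, so the byte offset is scaled down by 4 before the
// signed-range check.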
bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                             int64_t BrOffset) const {
  unsigned Bits = getBranchDisplacementBits(BranchOp);
  assert(Bits >= 3 && "max branch displacement must be enough to jump "
                      "over conditional branch expansion");
  return isIntN(Bits, BrOffset / 4);
}

MachineBasicBlock *
AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return MI.getOperand(0).getMBB();
  case AArch64::TBZW:
  case AArch64::TBNZW:
  case AArch64::TBZX:
  case AArch64::TBNZX:
    return MI.getOperand(2).getMBB();
  case AArch64::CBZW:
  case AArch64::CBNZW:
  case AArch64::CBZX:
  case AArch64::CBNZX:
  case AArch64::Bcc:
    return MI.getOperand(1).getMBB();
  case AArch64::CBWPri:
  case AArch64::CBXPri:
  case AArch64::CBWPrr:
  case AArch64::CBXPrr:
    return MI.getOperand(3).getMBB();
  }
}

void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
                                            MachineBasicBlock &NewDestBB,
                                            MachineBasicBlock &RestoreBB,
                                            const DebugLoc &DL,
                                            int64_t BrOffset,
                                            RegScavenger *RS) const {
  assert(RS && "RegScavenger required for long branching");
  assert(MBB.empty() &&
         "new block should be inserted for expanding unconditional branch");
  assert(MBB.pred_size() == 1);
  assert(RestoreBB.empty() &&
         "restore block should be inserted for restoring clobbered registers");

  auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
    // Offsets outside of the signed 33-bit range are not supported for ADRP +
    // ADD.
    if (!isInt<33>(BrOffset))
      report_fatal_error(
          "Branch offsets outside of the signed 33-bit range not supported");

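    // Materialize the destination address and branch through the register;
    // the emitted sequence is:
    //   adrp Reg, DestBB@PAGE
    //   add  Reg, Reg, DestBB@PAGEOFF
    //   br   Reg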
    BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
        .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
    BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
        .addReg(Reg)
        .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
        .addImm(0);
    BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
  };

  RS->enterBasicBlockEnd(MBB);
  // If X16 is unused, we can rely on the linker to insert a range extension
  // thunk if NewDestBB is out of range of a single B instruction.
  constexpr Register Reg = AArch64::X16;
  if (!RS->isRegUsed(Reg)) {
    insertUnconditionalBranch(MBB, &NewDestBB, DL);
    RS->setRegUsed(Reg);
    return;
  }

  // If there's a free register and it's worth inflating the code size,
  // manually insert the indirect branch.
  Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
  if (Scavenged != AArch64::NoRegister &&
      MBB.getSectionID() == MBBSectionID::ColdSectionID) {
    buildIndirectBranch(Scavenged, NewDestBB);
    RS->setRegUsed(Scavenged);
    return;
  }

  // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
  // with red zones.
  AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
  if (!AFI || AFI->hasRedZone().value_or(true))
    report_fatal_error(
        "Unable to insert indirect branch inside function that has red zone");

  // Otherwise, spill X16 and defer range extension to the linker.
  BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
      .addReg(AArch64::SP, RegState::Define)
      .addReg(Reg)
      .addReg(AArch64::SP)
      .addImm(-16);

  BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);

  BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
      .addReg(AArch64::SP, RegState::Define)
      .addReg(Reg, RegState::Define)
      .addReg(AArch64::SP)
      .addImm(16);
}

// Branch analysis.
bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *&TBB,
                                     MachineBasicBlock *&FBB,
                                     SmallVectorImpl<MachineOperand> &Cond,
                                     bool AllowModify) const {
  // If the block has no terminators, it just falls into the block after it.
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return false;

  // Skip over SpeculationBarrierEndBB terminators.
  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
    --I;
  }

  if (!isUnpredicatedTerminator(*I))
    return false;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;

  // If there is only one terminator instruction, process it.
  unsigned LastOpc = LastInst->getOpcode();
  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
    if (isUncondBranchOpcode(LastOpc)) {
      TBB = LastInst->getOperand(0).getMBB();
      return false;
    }
    if (isCondBranchOpcode(LastOpc)) {
      // Block ends with fall-through condbranch.
      parseCondBranch(LastInst, TBB, Cond);
      return false;
    }
    return true; // Can't handle indirect branch.
  }

  // Get the instruction before it if it is a terminator.
  MachineInstr *SecondLastInst = &*I;
  unsigned SecondLastOpc = SecondLastInst->getOpcode();

  // If AllowModify is true and the block ends with two or more unconditional
  // branches, delete all but the first unconditional branch.
  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
    while (isUncondBranchOpcode(SecondLastOpc)) {
      LastInst->eraseFromParent();
      LastInst = SecondLastInst;
      LastOpc = LastInst->getOpcode();
      if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
        // Return now; the only remaining terminator is an unconditional
        // branch.
        TBB = LastInst->getOperand(0).getMBB();
        return false;
      }
      SecondLastInst = &*I;
      SecondLastOpc = SecondLastInst->getOpcode();
    }
  }

  // If we're allowed to modify and the block ends in an unconditional branch
  // which could simply fall through, remove the branch. (Note: This case only
  // matters when we can't understand the whole sequence; otherwise it's also
  // handled by BranchFolding.cpp.)
  if (AllowModify && isUncondBranchOpcode(LastOpc) &&
      MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
    LastInst->eraseFromParent();
    LastInst = SecondLastInst;
    LastOpc = LastInst->getOpcode();
    if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
      assert(!isUncondBranchOpcode(LastOpc) &&
             "unreachable unconditional branches removed above");

      if (isCondBranchOpcode(LastOpc)) {
        // Block ends with fall-through condbranch.
        parseCondBranch(LastInst, TBB, Cond);
        return false;
      }
      return true; // Can't handle indirect branch.
    }
    SecondLastInst = &*I;
    SecondLastOpc = SecondLastInst->getOpcode();
  }

454
455 // If there are three terminators, we don't know what sort of block this is.
456 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
457 return true;
458
459 // If the block ends with a B and a Bcc, handle it.
460 if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
461 parseCondBranch(SecondLastInst, TBB, Cond);
462 FBB = LastInst->getOperand(0).getMBB();
463 return false;
464 }
465
466 // If the block ends with two unconditional branches, handle it. The second
467 // one is not executed, so remove it.
468 if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
469 TBB = SecondLastInst->getOperand(0).getMBB();
470 I = LastInst;
471 if (AllowModify)
472 I->eraseFromParent();
473 return false;
474 }
475
476 // ...likewise if it ends with an indirect branch followed by an unconditional
477 // branch.
478 if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
479 I = LastInst;
480 if (AllowModify)
481 I->eraseFromParent();
482 return true;
483 }
484
485 // Otherwise, can't handle this.
486 return true;
487 }

bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
                                              MachineBranchPredicate &MBP,
                                              bool AllowModify) const {
  // For the moment, handle only a block which ends with a cb(n)z followed by
  // a fallthrough. Why this? Because it is a common form.
  // TODO: Should we handle b.cc?

  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return true;

  // Skip over SpeculationBarrierEndBB terminators.
  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
    --I;
  }

  if (!isUnpredicatedTerminator(*I))
    return true;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;
  unsigned LastOpc = LastInst->getOpcode();
  if (!isCondBranchOpcode(LastOpc))
    return true;

  switch (LastOpc) {
  default:
    return true;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    break;
  }

  MBP.TrueDest = LastInst->getOperand(1).getMBB();
  assert(MBP.TrueDest && "expected!");
  MBP.FalseDest = MBB.getNextNode();

  MBP.ConditionDef = nullptr;
  MBP.SingleUseCondition = false;

  MBP.LHS = LastInst->getOperand(0);
  MBP.RHS = MachineOperand::CreateImm(0);
  MBP.Predicate = (LastOpc == AArch64::CBNZW || LastOpc == AArch64::CBNZX)
                      ? MachineBranchPredicate::PRED_NE
                      : MachineBranchPredicate::PRED_EQ;
  return false;
}

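/// Invert the condition encoded in Cond (see parseCondBranch above): flip the
/// Bcc condition code, or swap a folded compare-and-branch opcode with its
/// negated twin.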
bool AArch64InstrInfo::reverseBranchCondition(
    SmallVectorImpl<MachineOperand> &Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc.
    AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
    Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
  } else {
    // Folded compare-and-branch.
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown conditional branch!");
    case AArch64::CBZW:
      Cond[1].setImm(AArch64::CBNZW);
      break;
    case AArch64::CBNZW:
      Cond[1].setImm(AArch64::CBZW);
      break;
    case AArch64::CBZX:
      Cond[1].setImm(AArch64::CBNZX);
      break;
    case AArch64::CBNZX:
      Cond[1].setImm(AArch64::CBZX);
      break;
    case AArch64::TBZW:
      Cond[1].setImm(AArch64::TBNZW);
      break;
    case AArch64::TBNZW:
      Cond[1].setImm(AArch64::TBZW);
      break;
    case AArch64::TBZX:
      Cond[1].setImm(AArch64::TBNZX);
      break;
    case AArch64::TBNZX:
      Cond[1].setImm(AArch64::TBZX);
      break;

    // Cond is { -1, Opcode, CC, Op0, Op1 }.
    case AArch64::CBWPri:
    case AArch64::CBXPri:
    case AArch64::CBWPrr:
    case AArch64::CBXPrr: {
      // Pseudos using standard 4-bit Arm condition codes.
      AArch64CC::CondCode CC =
          static_cast<AArch64CC::CondCode>(Cond[2].getImm());
      Cond[2].setImm(AArch64CC::getInvertedCondCode(CC));
    }
    }
  }

  return false;
}

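/// Remove the branch terminators at the end of MBB: at most one unconditional
/// and one conditional branch. Returns the number of instructions removed
/// and, via BytesRemoved, their total size in bytes.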
unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
                                        int *BytesRemoved) const {
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return 0;

  if (!isUncondBranchOpcode(I->getOpcode()) &&
      !isCondBranchOpcode(I->getOpcode()))
    return 0;

  // Remove the branch.
  I->eraseFromParent();

  I = MBB.end();

  if (I == MBB.begin()) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }
  --I;
  if (!isCondBranchOpcode(I->getOpcode())) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }

  // Remove the branch.
  I->eraseFromParent();
  if (BytesRemoved)
    *BytesRemoved = 8;

  return 2;
}

void AArch64InstrInfo::instantiateCondBranch(
    MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
    ArrayRef<MachineOperand> Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc.
    BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
  } else {
    // Folded compare-and-branch.
    // Note that we use add() instead of addReg() to keep the operand flags.

    // cbz/cbnz
    const MachineInstrBuilder MIB =
        BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);

    // tbz/tbnz
    if (Cond.size() > 3)
      MIB.add(Cond[3]);

    // cb
    if (Cond.size() > 4)
      MIB.add(Cond[4]);

    MIB.addMBB(TBB);
  }
}

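/// Insert an unconditional, conditional, or two-way branch at the end of MBB,
/// using the Cond encoding produced by analyzeBranch. Returns the number of
/// instructions inserted; each branch emitted here is a single 4-byte
/// instruction.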
unsigned AArch64InstrInfo::insertBranch(
    MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
    ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
  // Shouldn't be a fall through.
  assert(TBB && "insertBranch must not be told to insert a fallthrough");

  if (!FBB) {
    if (Cond.empty()) // Unconditional branch?
      BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
    else
      instantiateCondBranch(MBB, DL, TBB, Cond);

    if (BytesAdded)
      *BytesAdded = 4;

    return 1;
  }

  // Two-way conditional branch.
  instantiateCondBranch(MBB, DL, TBB, Cond);
  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);

  if (BytesAdded)
    *BytesAdded = 8;

  return 2;
}

// Find the original register that VReg is copied from.
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
  while (Register::isVirtualRegister(VReg)) {
    const MachineInstr *DefMI = MRI.getVRegDef(VReg);
    if (!DefMI->isFullCopy())
      return VReg;
    VReg = DefMI->getOperand(1).getReg();
  }
  return VReg;
}

// Determine if VReg is defined by an instruction that can be folded into a
// csel instruction. If so, return the folded opcode and the replacement
// register.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
                                unsigned *NewVReg = nullptr) {
  VReg = removeCopies(MRI, VReg);
  if (!Register::isVirtualRegister(VReg))
    return 0;

  bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
  unsigned Opc = 0;
  unsigned SrcOpNum = 0;
  switch (DefMI->getOpcode()) {
  case AArch64::ADDSXri:
  case AArch64::ADDSWri:
    // If NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
                                         true) == -1)
      return 0;
    // Fall through to ADDXri and ADDWri.
    [[fallthrough]];
  case AArch64::ADDXri:
  case AArch64::ADDWri:
    // add x, 1 -> csinc.
    if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
        DefMI->getOperand(3).getImm() != 0)
      return 0;
    SrcOpNum = 1;
    Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
    break;

  case AArch64::ORNXrr:
  case AArch64::ORNWrr: {
    // not x -> csinv, represented as orn dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcOpNum = 2;
    Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
    break;
  }

  case AArch64::SUBSXrr:
  case AArch64::SUBSWrr:
    // If NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
                                         true) == -1)
      return 0;
    // Fall through to SUBXrr and SUBWrr.
    [[fallthrough]];
  case AArch64::SUBXrr:
  case AArch64::SUBWrr: {
    // neg x -> csneg, represented as sub dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcOpNum = 2;
    Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
    break;
  }
  default:
    return 0;
  }
  assert(Opc && SrcOpNum && "Missing parameters");

  if (NewVReg)
    *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
  return Opc;
}

bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
                                       ArrayRef<MachineOperand> Cond,
                                       Register DstReg, Register TrueReg,
                                       Register FalseReg, int &CondCycles,
                                       int &TrueCycles,
                                       int &FalseCycles) const {
  // Check register classes.
  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *RC =
      RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
  if (!RC)
    return false;

  // Also need to check the dest regclass, in case we're trying to optimize
  // something like:
  // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
  if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
    return false;

  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
  unsigned ExtraCondLat = Cond.size() != 1;

  // GPRs are handled by csel.
  // FIXME: Fold in x+1, -x, and ~x when applicable.
  if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
      AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
    // Single-cycle csel, csinc, csinv, and csneg.
    CondCycles = 1 + ExtraCondLat;
    TrueCycles = FalseCycles = 1;
    if (canFoldIntoCSel(MRI, TrueReg))
      TrueCycles = 0;
    else if (canFoldIntoCSel(MRI, FalseReg))
      FalseCycles = 0;
    return true;
  }

  // Scalar floating point is handled by fcsel.
  // FIXME: Form fabs, fmin, and fmax when applicable.
  if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
      AArch64::FPR32RegClass.hasSubClassEq(RC)) {
    CondCycles = 5 + ExtraCondLat;
    TrueCycles = FalseCycles = 2;
    return true;
  }

  // Can't do vectors.
  return false;
}

void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const DebugLoc &DL, Register DstReg,
                                    ArrayRef<MachineOperand> Cond,
                                    Register TrueReg, Register FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Parse the condition code, see parseCondBranch() above.
  AArch64CC::CondCode CC;
  switch (Cond.size()) {
  default:
    llvm_unreachable("Unknown condition opcode in Cond");
  case 1: // b.cc
    CC = AArch64CC::CondCode(Cond[0].getImm());
    break;
  case 3: { // cbz/cbnz
    // We must insert a compare against 0.
    bool Is64Bit;
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::CBZW:
      Is64Bit = false;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBZX:
      Is64Bit = true;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBNZW:
      Is64Bit = false;
      CC = AArch64CC::NE;
      break;
    case AArch64::CBNZX:
      Is64Bit = true;
      CC = AArch64CC::NE;
      break;
    }
    Register SrcReg = Cond[2].getReg();
    if (Is64Bit) {
      // cmp reg, #0 is actually subs xzr, reg, #0.
      MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    } else {
      MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    }
    break;
  }
  case 4: { // tbz/tbnz
    // We must insert a tst instruction.
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::TBZW:
    case AArch64::TBZX:
      CC = AArch64CC::EQ;
      break;
    case AArch64::TBNZW:
    case AArch64::TBNZX:
      CC = AArch64CC::NE;
      break;
    }
    // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
      BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
    else
      BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
    break;
  }
  case 5: { // cb
    // We must insert a cmp, that is, a subs.
    //            0    1      2   3    4
    // Cond is { -1, Opcode, CC, Op0, Op1 }
    unsigned SUBSOpC, SUBSDestReg;
    bool IsImm = false;
    CC = static_cast<AArch64CC::CondCode>(Cond[2].getImm());
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::CBWPri:
      SUBSOpC = AArch64::SUBSWri;
      SUBSDestReg = AArch64::WZR;
      IsImm = true;
      break;
    case AArch64::CBXPri:
      SUBSOpC = AArch64::SUBSXri;
      SUBSDestReg = AArch64::XZR;
      IsImm = true;
      break;
    case AArch64::CBWPrr:
      SUBSOpC = AArch64::SUBSWrr;
      SUBSDestReg = AArch64::WZR;
      IsImm = false;
      break;
    case AArch64::CBXPrr:
      SUBSOpC = AArch64::SUBSXrr;
      SUBSDestReg = AArch64::XZR;
      IsImm = false;
      break;
    }

    if (IsImm)
      BuildMI(MBB, I, DL, get(SUBSOpC), SUBSDestReg)
          .addReg(Cond[3].getReg())
          .addImm(Cond[4].getImm())
          .addImm(0);
    else
      BuildMI(MBB, I, DL, get(SUBSOpC), SUBSDestReg)
          .addReg(Cond[3].getReg())
          .addReg(Cond[4].getReg());
  }
  }

  unsigned Opc = 0;
  const TargetRegisterClass *RC = nullptr;
  bool TryFold = false;
  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
    RC = &AArch64::GPR64RegClass;
    Opc = AArch64::CSELXr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
    RC = &AArch64::GPR32RegClass;
    Opc = AArch64::CSELWr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FCSELDrrr;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
    RC = &AArch64::FPR32RegClass;
    Opc = AArch64::FCSELSrrr;
  }
  assert(RC && "Unsupported regclass");

  // Try folding simple instructions into the csel.
  if (TryFold) {
    unsigned NewVReg = 0;
    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
    if (FoldedOpc) {
      // The folded opcodes csinc, csinv, and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
      CC = AArch64CC::getInvertedCondCode(CC);
      TrueReg = FalseReg;
    } else
      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);

    // Fold the operation. Leave any dead instructions for DCE to clean up.
    if (FoldedOpc) {
      FalseReg = NewVReg;
      Opc = FoldedOpc;
      // This extends the live range of NewVReg.
      MRI.clearKillFlags(NewVReg);
    }
  }

  // Pull all virtual registers into the appropriate class.
  MRI.constrainRegClass(TrueReg, RC);
  MRI.constrainRegClass(FalseReg, RC);

  // Insert the csel.
  BuildMI(MBB, I, DL, get(Opc), DstReg)
      .addReg(TrueReg)
      .addReg(FalseReg)
      .addImm(CC);
}

// Return true if Imm can be loaded into a register by a "cheap" sequence of
// instructions. For now, "cheap" means at most two instructions.
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
  if (BitSize == 32)
    return true;

  assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
  uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
  AArch64_IMM::expandMOVImm(Imm, BitSize, Is);

  return Is.size() <= 2;
}

// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in the future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
  if (Subtarget.hasExynosCheapAsMoveHandling()) {
    if (isExynosCheapAsMove(MI))
      return true;
    return MI.isAsCheapAsAMove();
  }

  switch (MI.getOpcode()) {
  default:
    return MI.isAsCheapAsAMove();

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::SUBWrs:
  case AArch64::SUBXrs:
    return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;

  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
  // ORRXri, it is as cheap as MOV.
  // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
  case AArch64::MOVi32imm:
    return isCheapImmediate(MI, 32);
  case AArch64::MOVi64imm:
    return isCheapImmediate(MI, 64);
  }
}

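/// Return true when Falkor can execute the shifted- or extended-operand form
/// of MI (or the register-offset addressing mode) without the shift/extend
/// adding latency on that core.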
bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::ADDSWrs:
  case AArch64::ADDSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    if (ShiftVal == 0)
      return true;
    return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
  }

  case AArch64::ADDWrx:
  case AArch64::ADDXrx:
  case AArch64::ADDXrx64:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrx:
  case AArch64::ADDSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) <= 4;
    }
  }

  case AArch64::SUBWrs:
  case AArch64::SUBSWrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
  }

  case AArch64::SUBXrs:
  case AArch64::SUBSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
  }

  case AArch64::SUBWrx:
  case AArch64::SUBXrx:
  case AArch64::SUBXrx64:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrx:
  case AArch64::SUBSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) == 0;
    }
  }

  case AArch64::LDRBBroW:
  case AArch64::LDRBBroX:
  case AArch64::LDRBroW:
  case AArch64::LDRBroX:
  case AArch64::LDRDroW:
  case AArch64::LDRDroX:
  case AArch64::LDRHHroW:
  case AArch64::LDRHHroX:
  case AArch64::LDRHroW:
  case AArch64::LDRHroX:
  case AArch64::LDRQroW:
  case AArch64::LDRQroX:
  case AArch64::LDRSBWroW:
  case AArch64::LDRSBWroX:
  case AArch64::LDRSBXroW:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSHWroW:
  case AArch64::LDRSHWroX:
  case AArch64::LDRSHXroW:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSWroW:
  case AArch64::LDRSWroX:
  case AArch64::LDRSroW:
  case AArch64::LDRSroX:
  case AArch64::LDRWroW:
  case AArch64::LDRWroX:
  case AArch64::LDRXroW:
  case AArch64::LDRXroX:
  case AArch64::PRFMroW:
  case AArch64::PRFMroX:
  case AArch64::STRBBroW:
  case AArch64::STRBBroX:
  case AArch64::STRBroW:
  case AArch64::STRBroX:
  case AArch64::STRDroW:
  case AArch64::STRDroX:
  case AArch64::STRHHroW:
  case AArch64::STRHHroX:
  case AArch64::STRHroW:
  case AArch64::STRHroX:
  case AArch64::STRQroW:
  case AArch64::STRQroX:
  case AArch64::STRSroW:
  case AArch64::STRSroX:
  case AArch64::STRWroW:
  case AArch64::STRWroX:
  case AArch64::STRXroW:
  case AArch64::STRXroX: {
    unsigned IsSigned = MI.getOperand(3).getImm();
    return !IsSigned;
  }
  }
}

bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  case AArch64::SEH_StackAlloc:
  case AArch64::SEH_SaveFPLR:
  case AArch64::SEH_SaveFPLR_X:
  case AArch64::SEH_SaveReg:
  case AArch64::SEH_SaveReg_X:
  case AArch64::SEH_SaveRegP:
  case AArch64::SEH_SaveRegP_X:
  case AArch64::SEH_SaveFReg:
  case AArch64::SEH_SaveFReg_X:
  case AArch64::SEH_SaveFRegP:
  case AArch64::SEH_SaveFRegP_X:
  case AArch64::SEH_SetFP:
  case AArch64::SEH_AddFP:
  case AArch64::SEH_Nop:
  case AArch64::SEH_PrologEnd:
  case AArch64::SEH_EpilogStart:
  case AArch64::SEH_EpilogEnd:
  case AArch64::SEH_PACSignLR:
  case AArch64::SEH_SaveAnyRegQP:
  case AArch64::SEH_SaveAnyRegQPX:
  case AArch64::SEH_AllocZ:
  case AArch64::SEH_SaveZReg:
  case AArch64::SEH_SavePReg:
    return true;
  }
}

bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
                                             Register &SrcReg, Register &DstReg,
                                             unsigned &SubIdx) const {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::SBFMXri: // aka sxtw
  case AArch64::UBFMXri: // aka uxtw
    // Check for the 32 -> 64 bit extension case; these instructions can do
    // much more.
    if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
      return false;
    // This is a signed or unsigned 32 -> 64 bit extension.
    SrcReg = MI.getOperand(1).getReg();
    DstReg = MI.getOperand(0).getReg();
    SubIdx = AArch64::sub_32;
    return true;
  }
}

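/// Return true if MIa and MIb provably access disjoint memory, by comparing
/// the base operand, offset, and access width of the two instructions.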
bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
    const MachineInstr &MIa, const MachineInstr &MIb) const {
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
  int64_t OffsetA = 0, OffsetB = 0;
  TypeSize WidthA(0, false), WidthB(0, false);
  bool OffsetAIsScalable = false, OffsetBIsScalable = false;

  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");

  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
      MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    return false;

  // Retrieve the base, the offset from the base, and the width. Width is the
  // size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If the
  // bases are identical, and the offset of the lower memory access plus its
  // width doesn't overlap the offset of the higher memory access, then the
  // memory accesses are disjoint.
  // If OffsetAIsScalable and OffsetBIsScalable are both true, they
  // are assumed to have the same scale (vscale).
  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
                                   WidthA, TRI) &&
      getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
                                   WidthB, TRI)) {
    if (BaseOpA->isIdenticalTo(*BaseOpB) &&
        OffsetAIsScalable == OffsetBIsScalable) {
      int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
      int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
      TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
      if (LowWidth.isScalable() == OffsetAIsScalable &&
          LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
        return true;
    }
  }
  return false;
}

bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                            const MachineBasicBlock *MBB,
                                            const MachineFunction &MF) const {
  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
    return true;

  // Do not move an instruction that can be recognized as a branch target.
  if (hasBTISemantics(MI))
    return true;

  switch (MI.getOpcode()) {
  case AArch64::HINT:
    // CSDB hints are scheduling barriers.
    if (MI.getOperand(0).getImm() == 0x14)
      return true;
    break;
  case AArch64::DSB:
  case AArch64::ISB:
    // DSB and ISB also are scheduling barriers.
    return true;
  case AArch64::MSRpstatesvcrImm1:
    // SMSTART and SMSTOP are also scheduling barriers.
    return true;
  default:
    break;
  }
  if (isSEHInstruction(MI))
    return true;
  auto Next = std::next(MI.getIterator());
  return Next != MBB->end() && Next->isCFIInstruction();
}

/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
                                      Register &SrcReg2, int64_t &CmpMask,
                                      int64_t &CmpValue) const {
  // The first operand can be a frame index where we'd normally expect a
  // register.
  // FIXME: Pass subregisters out of analyzeCompare
  assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
  if (!MI.getOperand(1).isReg() || MI.getOperand(1).getSubReg())
    return false;

  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::PTEST_PP:
  case AArch64::PTEST_PP_ANY:
    SrcReg = MI.getOperand(0).getReg();
    SrcReg2 = MI.getOperand(1).getReg();
    if (MI.getOperand(2).getSubReg())
      return false;

    // Not sure about the mask and value for now...
    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWrr:
  case AArch64::SUBSWrs:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXrs:
  case AArch64::SUBSXrx:
  case AArch64::ADDSWrr:
  case AArch64::ADDSWrs:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXrs:
  case AArch64::ADDSXrx:
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = MI.getOperand(2).getReg();

    // FIXME: Pass subregisters out of analyzeCompare
    if (MI.getOperand(2).getSubReg())
      return false;

    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWri:
  case AArch64::ADDSWri:
  case AArch64::SUBSXri:
  case AArch64::ADDSXri:
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    CmpValue = MI.getOperand(2).getImm();
    return true;
  case AArch64::ANDSWri:
  case AArch64::ANDSXri:
    // ANDS does not use the same encoding scheme as the other xxxS
    // instructions.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    CmpValue = AArch64_AM::decodeLogicalImmediate(
        MI.getOperand(2).getImm(),
        MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
    return true;
  }

  return false;
}

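/// Re-constrain the register class of every register operand of Instr to the
/// constraints of its (possibly just rewritten) opcode. Returns false if some
/// operand cannot satisfy the new constraints.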
static bool UpdateOperandRegClass(MachineInstr &Instr) {
  MachineBasicBlock *MBB = Instr.getParent();
  assert(MBB && "Can't get MachineBasicBlock here");
  MachineFunction *MF = MBB->getParent();
  assert(MF && "Can't get MachineFunction here");
  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  MachineRegisterInfo *MRI = &MF->getRegInfo();

  for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
       ++OpIdx) {
    MachineOperand &MO = Instr.getOperand(OpIdx);
    const TargetRegisterClass *OpRegCstraints =
        Instr.getRegClassConstraint(OpIdx, TII, TRI);

    // If there's no constraint, there's nothing to do.
    if (!OpRegCstraints)
      continue;
    // If the operand is a frame index, there's nothing to do here.
    // A frame index operand will resolve correctly during PEI.
    if (MO.isFI())
      continue;

    assert(MO.isReg() &&
           "Operand has register constraints without being a register!");

    Register Reg = MO.getReg();
    if (Reg.isPhysical()) {
      if (!OpRegCstraints->contains(Reg))
        return false;
    } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
               !MRI->constrainRegClass(Reg, OpRegCstraints))
      return false;
  }

  return true;
}

/// Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible for doing the
/// actual substitution and legality checking.
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
  // Don't convert all compare instructions, because for some the zero register
  // encoding becomes the sp register.
  bool MIDefinesZeroReg = false;
  if (MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
      MI.definesRegister(AArch64::XZR, /*TRI=*/nullptr))
    MIDefinesZeroReg = true;

  switch (MI.getOpcode()) {
  default:
    return MI.getOpcode();
  case AArch64::ADDSWrr:
    return AArch64::ADDWrr;
  case AArch64::ADDSWri:
    return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
  case AArch64::ADDSWrs:
    return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
  case AArch64::ADDSWrx:
    return AArch64::ADDWrx;
  case AArch64::ADDSXrr:
    return AArch64::ADDXrr;
  case AArch64::ADDSXri:
    return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
  case AArch64::ADDSXrs:
    return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
  case AArch64::ADDSXrx:
    return AArch64::ADDXrx;
  case AArch64::SUBSWrr:
    return AArch64::SUBWrr;
  case AArch64::SUBSWri:
    return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
  case AArch64::SUBSWrs:
    return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
  case AArch64::SUBSWrx:
    return AArch64::SUBWrx;
  case AArch64::SUBSXrr:
    return AArch64::SUBXrr;
  case AArch64::SUBSXri:
    return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
  case AArch64::SUBSXrs:
    return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
  case AArch64::SUBSXrx:
    return AArch64::SUBXrx;
  }
}

enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };

/// True when condition flags are accessed (either by writing or reading)
/// on the instruction trace starting at From and ending at To.
///
/// Note: If From and To are from different blocks it's assumed the flags are
/// accessed on the path.
static bool areCFlagsAccessedBetweenInstrs(
    MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
    const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
  // Early exit if To is at the beginning of the BB.
  if (To == To->getParent()->begin())
    return true;

  // Check whether the instructions are in the same basic block.
  // If not, assume the condition flags might get modified somewhere.
  if (To->getParent() != From->getParent())
    return true;

  // From must be above To.
  assert(std::any_of(
      ++To.getReverse(), To->getParent()->rend(),
      [From](MachineInstr &MI) { return MI.getIterator() == From; }));

  // We iterate backward starting at \p To until we hit \p From.
  for (const MachineInstr &Instr :
       instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
    if (((AccessToCheck & AK_Write) &&
         Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
        ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
      return true;
  }
  return false;
}

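/// Decide whether a PTEST of \p Pred against governing predicate \p Mask is
/// redundant, i.e. the flags it would set are already produced by \p Pred
/// itself. Returns the opcode \p Pred should use (possibly a flag-setting
/// variant of its current opcode) if the PTEST can be removed, or
/// std::nullopt otherwise.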
std::optional<unsigned>
AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
                                      MachineInstr *Pred,
                                      const MachineRegisterInfo *MRI) const {
  unsigned MaskOpcode = Mask->getOpcode();
  unsigned PredOpcode = Pred->getOpcode();
  bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
  bool PredIsWhileLike = isWhileOpcode(PredOpcode);

  if (PredIsWhileLike) {
    // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
    // instruction and the condition is "any" since WHILEcc does an implicit
    // PTEST(ALL, PG) check and PG is always a subset of ALL.
    if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
      return PredOpcode;

    // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
    // redundant since WHILE performs an implicit PTEST with an all active
    // mask.
    if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
        getElementSizeForOpcode(MaskOpcode) ==
            getElementSizeForOpcode(PredOpcode))
      return PredOpcode;

    return {};
  }

  if (PredIsPTestLike) {
    // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
    // instruction that sets the flags as PTEST would and the condition is
    // "any" since PG is always a subset of the governing predicate of the
    // ptest-like instruction.
    if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
      return PredOpcode;

    auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());

    // If the PTEST-like instruction's governing predicate is not `Mask`,
    // attempt to look through a copy and try again. This is because some
    // instructions take a predicate whose register class is a subset of its
    // result class.
    if (Mask != PTestLikeMask && PTestLikeMask->isFullCopy() &&
        PTestLikeMask->getOperand(1).getReg().isVirtual())
      PTestLikeMask =
          MRI->getUniqueVRegDef(PTestLikeMask->getOperand(1).getReg());

    // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the element
    // size matches and either the PTEST_LIKE instruction uses the same all
    // active mask or the condition is "any".
    if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
        getElementSizeForOpcode(MaskOpcode) ==
            getElementSizeForOpcode(PredOpcode)) {
      if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
        return PredOpcode;
    }

    // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
    // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
    // on 8-bit predicates like the PTEST. Otherwise, for instructions like
    // compare that also support 16/32/64-bit predicates, the implicit PTEST
    // performed by the compare could consider fewer lanes for these element
    // sizes.
    //
    // For example, consider
    //
    //   ptrue p0.b                    ; P0=1111-1111-1111-1111
    //   index z0.s, #0, #1            ; Z0=<0,1,2,3>
    //   index z1.s, #1, #1            ; Z1=<1,2,3,4>
    //   cmphi p1.s, p0/z, z1.s, z0.s  ; P1=0001-0001-0001-0001
    //                                 ;       ^ last active
    //   ptest p0, p1.b                ; P1=0001-0001-0001-0001
    //                                 ;     ^ last active
    //
    // where the compare generates a canonical all active 32-bit predicate
    // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
    // active flag, whereas the PTEST instruction with the same mask doesn't.
    // For PTEST_ANY this doesn't apply as the flags in this case would be
    // identical regardless of element size.
    uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
    if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
                                  PTest->getOpcode() == AArch64::PTEST_PP_ANY))
      return PredOpcode;

    return {};
  }

  // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
  // opcode so the PTEST becomes redundant.
  switch (PredOpcode) {
  case AArch64::AND_PPzPP:
  case AArch64::BIC_PPzPP:
  case AArch64::EOR_PPzPP:
  case AArch64::NAND_PPzPP:
  case AArch64::NOR_PPzPP:
  case AArch64::ORN_PPzPP:
  case AArch64::ORR_PPzPP:
  case AArch64::BRKA_PPzP:
  case AArch64::BRKPA_PPzPP:
  case AArch64::BRKB_PPzP:
  case AArch64::BRKPB_PPzPP:
  case AArch64::RDFFR_PPz: {
    // Check to see if our mask is the same. If not the resulting flag bits
    // may be different and we can't remove the ptest.
    auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
    if (Mask != PredMask)
      return {};
    break;
  }
  case AArch64::BRKN_PPzP: {
    // BRKN uses an all active implicit mask to set flags unlike the other
    // flag-setting instructions.
    // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
    if ((MaskOpcode != AArch64::PTRUE_B) ||
        (Mask->getOperand(1).getImm() != 31))
      return {};
    break;
  }
  case AArch64::PTRUE_B:
    // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
    break;
  default:
    // Bail out if we don't recognize the input.
    return {};
  }

  return convertToFlagSettingOpc(PredOpcode);
}
1599
1600 /// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1601 /// operation that could set the flags in an identical manner.
1602 bool AArch64InstrInfo::optimizePTestInstr(
1603 MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1604 const MachineRegisterInfo *MRI) const {
1605 auto *Mask = MRI->getUniqueVRegDef(MaskReg);
1606 auto *Pred = MRI->getUniqueVRegDef(PredReg);
1607 unsigned PredOpcode = Pred->getOpcode();
1608 auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
1609 if (!NewOp)
1610 return false;
1611
1612 const TargetRegisterInfo *TRI = &getRegisterInfo();
1613
1614 // If another instruction between Pred and PTest accesses flags, don't remove
1615 // the ptest or update the earlier instruction to modify them.
1616 if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
1617 return false;
1618
1619 // If we pass all the checks, it's safe to remove the PTEST and use the flags
1620 // as they are prior to PTEST. Sometimes this requires the tested PTEST
1621 // operand to be replaced with an equivalent instruction that also sets the
1622 // flags.
1623 PTest->eraseFromParent();
1624 if (*NewOp != PredOpcode) {
1625 Pred->setDesc(get(*NewOp));
1626 bool succeeded = UpdateOperandRegClass(*Pred);
1627 (void)succeeded;
1628 assert(succeeded && "Operands have incompatible register classes!");
1629 Pred->addRegisterDefined(AArch64::NZCV, TRI);
1630 }
1631
1632 // Ensure that the flags def is live.
1633 if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1634 unsigned i = 0, e = Pred->getNumOperands();
1635 for (; i != e; ++i) {
1636 MachineOperand &MO = Pred->getOperand(i);
1637 if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1638 MO.setIsDead(false);
1639 break;
1640 }
1641 }
1642 }
1643 return true;
1644 }
1645
1646 /// Try to optimize a compare instruction. A compare instruction is an
1647 /// instruction which produces AArch64::NZCV. It is a true compare
1648 /// instruction only when there are no uses of its destination register.
1650 ///
1651 /// The following steps are tried in order:
1652 /// 1. Convert CmpInstr into a non-flag-setting version if NZCV is unused.
1653 /// 2. Remove CmpInstr if there is an earlier instruction producing the
1654 ///    needed condition code, or an instruction which can be converted into
1655 ///    such an instruction.
1656 /// Only comparison with zero is supported.
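///
/// For example (a sketch of step 1), when NZCV is otherwise unused:
/// \code
///   subs w0, w1, w2
/// \endcode
/// becomes
/// \code
///   sub w0, w1, w2
/// \endcode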
1657 bool AArch64InstrInfo::optimizeCompareInstr(
1658 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
1659 int64_t CmpValue, const MachineRegisterInfo *MRI) const {
1660 assert(CmpInstr.getParent());
1661 assert(MRI);
1662
1663 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1664 int DeadNZCVIdx =
1665 CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
1666 if (DeadNZCVIdx != -1) {
1667 if (CmpInstr.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1668 CmpInstr.definesRegister(AArch64::XZR, /*TRI=*/nullptr)) {
1669 CmpInstr.eraseFromParent();
1670 return true;
1671 }
1672 unsigned Opc = CmpInstr.getOpcode();
1673 unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1674 if (NewOpc == Opc)
1675 return false;
1676 const MCInstrDesc &MCID = get(NewOpc);
1677 CmpInstr.setDesc(MCID);
1678 CmpInstr.removeOperand(DeadNZCVIdx);
1679 bool succeeded = UpdateOperandRegClass(CmpInstr);
1680 (void)succeeded;
1681 assert(succeeded && "Some operands reg class are incompatible!");
1682 return true;
1683 }
1684
1685 if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
1686 CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY)
1687 return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
1688
1689 if (SrcReg2 != 0)
1690 return false;
1691
1692 // CmpInstr is a Compare instruction if destination register is not used.
1693 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1694 return false;
1695
1696 if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
1697 return true;
1698 return (CmpValue == 0 || CmpValue == 1) &&
1699 removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
1700 }
1701
1702 /// Get the opcode of the S (flag-setting) version of Instr.
1703 /// If Instr is already the S version, its opcode is returned.
1704 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
1705 /// version or we are not interested in it.
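/// For example, an ADDWrr instruction maps to ADDSWrr, while an ADDSWrr
/// instruction maps to itself.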
1706 static unsigned sForm(MachineInstr &Instr) {
1707 switch (Instr.getOpcode()) {
1708 default:
1709 return AArch64::INSTRUCTION_LIST_END;
1710
1711 case AArch64::ADDSWrr:
1712 case AArch64::ADDSWri:
1713 case AArch64::ADDSXrr:
1714 case AArch64::ADDSXri:
1715 case AArch64::SUBSWrr:
1716 case AArch64::SUBSWri:
1717 case AArch64::SUBSXrr:
1718 case AArch64::SUBSXri:
1719 return Instr.getOpcode();
1720
1721 case AArch64::ADDWrr:
1722 return AArch64::ADDSWrr;
1723 case AArch64::ADDWri:
1724 return AArch64::ADDSWri;
1725 case AArch64::ADDXrr:
1726 return AArch64::ADDSXrr;
1727 case AArch64::ADDXri:
1728 return AArch64::ADDSXri;
1729 case AArch64::ADCWr:
1730 return AArch64::ADCSWr;
1731 case AArch64::ADCXr:
1732 return AArch64::ADCSXr;
1733 case AArch64::SUBWrr:
1734 return AArch64::SUBSWrr;
1735 case AArch64::SUBWri:
1736 return AArch64::SUBSWri;
1737 case AArch64::SUBXrr:
1738 return AArch64::SUBSXrr;
1739 case AArch64::SUBXri:
1740 return AArch64::SUBSXri;
1741 case AArch64::SBCWr:
1742 return AArch64::SBCSWr;
1743 case AArch64::SBCXr:
1744 return AArch64::SBCSXr;
1745 case AArch64::ANDWri:
1746 return AArch64::ANDSWri;
1747 case AArch64::ANDXri:
1748 return AArch64::ANDSXri;
1749 }
1750 }
1751
1752 /// Check if AArch64::NZCV should be alive in successors of MBB.
1753 static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
1754 for (auto *BB : MBB->successors())
1755 if (BB->isLiveIn(AArch64::NZCV))
1756 return true;
1757 return false;
1758 }
1759
1760 /// \returns The condition code operand index for \p Instr if it is a branch
1761 /// or select and -1 otherwise.
1762 static int
1763 findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
1764 switch (Instr.getOpcode()) {
1765 default:
1766 return -1;
1767
1768 case AArch64::Bcc: {
1769 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
1770 assert(Idx >= 2);
1771 return Idx - 2;
1772 }
1773
1774 case AArch64::CSINVWr:
1775 case AArch64::CSINVXr:
1776 case AArch64::CSINCWr:
1777 case AArch64::CSINCXr:
1778 case AArch64::CSELWr:
1779 case AArch64::CSELXr:
1780 case AArch64::CSNEGWr:
1781 case AArch64::CSNEGXr:
1782 case AArch64::FCSELSrrr:
1783 case AArch64::FCSELDrrr: {
1784 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
1785 assert(Idx >= 1);
1786 return Idx - 1;
1787 }
1788 }
1789 }
1790
1791 /// Find a condition code used by the instruction.
1792 /// Returns AArch64CC::Invalid if either the instruction does not use condition
1793 /// codes or we don't optimize CmpInstr in the presence of such instructions.
1794 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1795 int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
1796 return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
1797 Instr.getOperand(CCIdx).getImm())
1798 : AArch64CC::Invalid;
1799 }
1800
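/// Map a condition code to the NZCV flags it reads. For example, HI (unsigned
/// higher) tests "Z clear and C set", so both Z and C are marked as used.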
1801 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1802 assert(CC != AArch64CC::Invalid);
1803 UsedNZCV UsedFlags;
1804 switch (CC) {
1805 default:
1806 break;
1807
1808 case AArch64CC::EQ: // Z set
1809 case AArch64CC::NE: // Z clear
1810 UsedFlags.Z = true;
1811 break;
1812
1813 case AArch64CC::HI: // Z clear and C set
1814 case AArch64CC::LS: // Z set or C clear
1815 UsedFlags.Z = true;
1816 [[fallthrough]];
1817 case AArch64CC::HS: // C set
1818 case AArch64CC::LO: // C clear
1819 UsedFlags.C = true;
1820 break;
1821
1822 case AArch64CC::MI: // N set
1823 case AArch64CC::PL: // N clear
1824 UsedFlags.N = true;
1825 break;
1826
1827 case AArch64CC::VS: // V set
1828 case AArch64CC::VC: // V clear
1829 UsedFlags.V = true;
1830 break;
1831
1832 case AArch64CC::GT: // Z clear, N and V the same
1833 case AArch64CC::LE: // Z set, N and V differ
1834 UsedFlags.Z = true;
1835 [[fallthrough]];
1836 case AArch64CC::GE: // N and V the same
1837 case AArch64CC::LT: // N and V differ
1838 UsedFlags.N = true;
1839 UsedFlags.V = true;
1840 break;
1841 }
1842 return UsedFlags;
1843 }
1844
1845 /// \returns Condition flags used after \p CmpInstr in its MachineBB if NZCV
1846 /// flags are not alive in successors of the common \p CmpInstr and \p MI
1847 /// parent. \returns std::nullopt otherwise.
1848 ///
1849 /// Collects instructions using those flags in \p CCUseInstrs if provided.
1850 std::optional<UsedNZCV>
1851 llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
1852 const TargetRegisterInfo &TRI,
1853 SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
1854 MachineBasicBlock *CmpParent = CmpInstr.getParent();
1855 if (MI.getParent() != CmpParent)
1856 return std::nullopt;
1857
1858 if (areCFlagsAliveInSuccessors(CmpParent))
1859 return std::nullopt;
1860
1861 UsedNZCV NZCVUsedAfterCmp;
1862 for (MachineInstr &Instr : instructionsWithoutDebug(
1863 std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
1864 if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
1865 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1866 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1867 return std::nullopt;
1868 NZCVUsedAfterCmp |= getUsedNZCV(CC);
1869 if (CCUseInstrs)
1870 CCUseInstrs->push_back(&Instr);
1871 }
1872 if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
1873 break;
1874 }
1875 return NZCVUsedAfterCmp;
1876 }
1877
1878 static bool isADDSRegImm(unsigned Opcode) {
1879 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1880 }
1881
1882 static bool isSUBSRegImm(unsigned Opcode) {
1883 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1884 }
1885
1886 /// Check if CmpInstr can be substituted by MI.
1887 ///
1888 /// CmpInstr can be substituted:
1889 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1890 /// - and, MI and CmpInstr are from the same MachineBB
1891 /// - and, condition flags are not alive in successors of the CmpInstr parent
1892 /// - and, if MI opcode is the S form there must be no defs of flags between
1893 /// MI and CmpInstr
1894 /// or if MI opcode is not the S form there must be neither defs of flags
1895 /// nor uses of flags between MI and CmpInstr.
1896 /// - and, the C flag is not used after CmpInstr
1897 /// - and, either the V flag is not used after CmpInstr, or MI produces a
1898 ///   poison value when signed overflow occurs (it has no-signed-wrap).
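///
/// For example (a sketch):
/// \code
///   sub  w0, w1, w2
///   subs wzr, w0, #0        ; cmp w0, #0
///   b.ne <target>
/// \endcode
/// can become
/// \code
///   subs w0, w1, w2
///   b.ne <target>
/// \endcode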
1899 static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
1900 const TargetRegisterInfo &TRI) {
1901   // NOTE: this assertion guarantees that MI.getOpcode() is an add or a
1902   // subtraction that may or may not set flags.
1903 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
1904
1905 const unsigned CmpOpcode = CmpInstr.getOpcode();
1906 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
1907 return false;
1908
1909 assert((CmpInstr.getOperand(2).isImm() &&
1910 CmpInstr.getOperand(2).getImm() == 0) &&
1911 "Caller guarantees that CmpInstr compares with constant 0");
1912
1913 std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
1914 if (!NZVCUsed || NZVCUsed->C)
1915 return false;
1916
1917 // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
1918 // '%vreg = add ...' or '%vreg = sub ...'.
1919 // Condition flag V is used to indicate signed overflow.
1920 // 1) MI and CmpInstr set N and V to the same value.
1921 // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
1922 // signed overflow occurs, so CmpInstr could still be simplified away.
1923 if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap))
1924 return false;
1925
1926 AccessKind AccessToCheck = AK_Write;
1927 if (sForm(MI) != MI.getOpcode())
1928 AccessToCheck = AK_All;
1929 return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
1930 }
1931
1932 /// Substitute an instruction comparing to zero with another instruction
1933 /// which produces needed condition flags.
1934 ///
1935 /// Return true on success.
1936 bool AArch64InstrInfo::substituteCmpToZero(
1937 MachineInstr &CmpInstr, unsigned SrcReg,
1938 const MachineRegisterInfo &MRI) const {
1939 // Get the unique definition of SrcReg.
1940 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1941 if (!MI)
1942 return false;
1943
1944 const TargetRegisterInfo &TRI = getRegisterInfo();
1945
1946 unsigned NewOpc = sForm(*MI);
1947 if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1948 return false;
1949
1950 if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
1951 return false;
1952
1953 // Update the instruction to set NZCV.
1954 MI->setDesc(get(NewOpc));
1955 CmpInstr.eraseFromParent();
1956 bool succeeded = UpdateOperandRegClass(*MI);
1957 (void)succeeded;
1958 assert(succeeded && "Some operands reg class are incompatible!");
1959 MI->addRegisterDefined(AArch64::NZCV, &TRI);
1960 return true;
1961 }
1962
1963 /// \returns True if \p CmpInstr can be removed.
1964 ///
1965 /// \p IsInvertCC is true if, after removing \p CmpInstr, condition
1966 /// codes used in \p CCUseInstrs must be inverted.
1967 static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
1968 int CmpValue, const TargetRegisterInfo &TRI,
1969 SmallVectorImpl<MachineInstr *> &CCUseInstrs,
1970 bool &IsInvertCC) {
1971 assert((CmpValue == 0 || CmpValue == 1) &&
1972 "Only comparisons to 0 or 1 considered for removal!");
1973
1974 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
1975 unsigned MIOpc = MI.getOpcode();
1976 if (MIOpc == AArch64::CSINCWr) {
1977 if (MI.getOperand(1).getReg() != AArch64::WZR ||
1978 MI.getOperand(2).getReg() != AArch64::WZR)
1979 return false;
1980 } else if (MIOpc == AArch64::CSINCXr) {
1981 if (MI.getOperand(1).getReg() != AArch64::XZR ||
1982 MI.getOperand(2).getReg() != AArch64::XZR)
1983 return false;
1984 } else {
1985 return false;
1986 }
1987 AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI);
1988 if (MICC == AArch64CC::Invalid)
1989 return false;
1990
1991 // NZCV needs to be defined
1992 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) != -1)
1993 return false;
1994
1995 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
1996 const unsigned CmpOpcode = CmpInstr.getOpcode();
1997 bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
1998 if (CmpValue && !IsSubsRegImm)
1999 return false;
2000 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
2001 return false;
2002
2003 // MI conditions allowed: eq, ne, mi, pl
2004 UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
2005 if (MIUsedNZCV.C || MIUsedNZCV.V)
2006 return false;
2007
2008 std::optional<UsedNZCV> NZCVUsedAfterCmp =
2009 examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
2010   // Condition flags are not used in CmpInstr basic block successors, and only
2011   // the Z or N flags are allowed to be used after CmpInstr within its basic block
2012 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
2013 return false;
2014 // Z or N flag used after CmpInstr must correspond to the flag used in MI
2015 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
2016 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
2017 return false;
2018   // If CmpInstr is a comparison to zero, MI conditions are limited to eq, ne
2019 if (MIUsedNZCV.N && !CmpValue)
2020 return false;
2021
2022 // There must be no defs of flags between MI and CmpInstr
2023 if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
2024 return false;
2025
2026 // Condition code is inverted in the following cases:
2027 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
2028 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
2029 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
2030 (!CmpValue && MICC == AArch64CC::NE);
2031 return true;
2032 }
2033
2034 /// Remove comparison in csinc-cmp sequence
2035 ///
2036 /// Examples:
2037 /// 1. \code
2038 /// csinc w9, wzr, wzr, ne
2039 /// cmp w9, #0
2040 /// b.eq
2041 /// \endcode
2042 /// to
2043 /// \code
2044 /// csinc w9, wzr, wzr, ne
2045 /// b.ne
2046 /// \endcode
2047 ///
2048 /// 2. \code
2049 /// csinc x2, xzr, xzr, mi
2050 /// cmp x2, #1
2051 /// b.pl
2052 /// \endcode
2053 /// to
2054 /// \code
2055 /// csinc x2, xzr, xzr, mi
2056 /// b.pl
2057 /// \endcode
2058 ///
2059 /// \param CmpInstr comparison instruction
2060 /// \return True when comparison removed
2061 bool AArch64InstrInfo::removeCmpToZeroOrOne(
2062 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
2063 const MachineRegisterInfo &MRI) const {
2064 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
2065 if (!MI)
2066 return false;
2067 const TargetRegisterInfo &TRI = getRegisterInfo();
2068 SmallVector<MachineInstr *, 4> CCUseInstrs;
2069 bool IsInvertCC = false;
2070 if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
2071 IsInvertCC))
2072 return false;
2073 // Make transformation
2074 CmpInstr.eraseFromParent();
2075 if (IsInvertCC) {
2076 // Invert condition codes in CmpInstr CC users
2077 for (MachineInstr *CCUseInstr : CCUseInstrs) {
2078 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
2079 assert(Idx >= 0 && "Unexpected instruction using CC.");
2080 MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
2081 AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
2082 static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
2083 CCOperand.setImm(CCUse);
2084 }
2085 }
2086 return true;
2087 }
2088
2089 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2090 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
2091 MI.getOpcode() != AArch64::CATCHRET)
2092 return false;
2093
2094 MachineBasicBlock &MBB = *MI.getParent();
2095 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
2096 auto TRI = Subtarget.getRegisterInfo();
2097 DebugLoc DL = MI.getDebugLoc();
2098
2099 if (MI.getOpcode() == AArch64::CATCHRET) {
2100 // Skip to the first instruction before the epilog.
2101 const TargetInstrInfo *TII =
2102 MBB.getParent()->getSubtarget().getInstrInfo();
2103 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
2104 auto MBBI = MachineBasicBlock::iterator(MI);
2105 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
2106 while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
2107 FirstEpilogSEH != MBB.begin())
2108 FirstEpilogSEH = std::prev(FirstEpilogSEH);
2109 if (FirstEpilogSEH != MBB.begin())
2110 FirstEpilogSEH = std::next(FirstEpilogSEH);
2111 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
2112 .addReg(AArch64::X0, RegState::Define)
2113 .addMBB(TargetMBB);
2114 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
2115 .addReg(AArch64::X0, RegState::Define)
2116 .addReg(AArch64::X0)
2117 .addMBB(TargetMBB)
2118 .addImm(0);
2119 TargetMBB->setMachineBlockAddressTaken();
2120 return true;
2121 }
2122
2123 Register Reg = MI.getOperand(0).getReg();
2124 Module &M = *MBB.getParent()->getFunction().getParent();
2125 if (M.getStackProtectorGuard() == "sysreg") {
2126 const AArch64SysReg::SysReg *SrcReg =
2127 AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
2128 if (!SrcReg)
2129 report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
2130
2131 // mrs xN, sysreg
2132 BuildMI(MBB, MI, DL, get(AArch64::MRS))
2133 .addDef(Reg, RegState::Renamable)
2134 .addImm(SrcReg->Encoding);
2135 int Offset = M.getStackProtectorGuardOffset();
2136 if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
2137 // ldr xN, [xN, #offset]
2138 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2139 .addDef(Reg)
2140 .addUse(Reg, RegState::Kill)
2141 .addImm(Offset / 8);
2142 } else if (Offset >= -256 && Offset <= 255) {
2143 // ldur xN, [xN, #offset]
2144 BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
2145 .addDef(Reg)
2146 .addUse(Reg, RegState::Kill)
2147 .addImm(Offset);
2148 } else if (Offset >= -4095 && Offset <= 4095) {
2149 if (Offset > 0) {
2150 // add xN, xN, #offset
2151 BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
2152 .addDef(Reg)
2153 .addUse(Reg, RegState::Kill)
2154 .addImm(Offset)
2155 .addImm(0);
2156 } else {
2157 // sub xN, xN, #offset
2158 BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
2159 .addDef(Reg)
2160 .addUse(Reg, RegState::Kill)
2161 .addImm(-Offset)
2162 .addImm(0);
2163 }
2164 // ldr xN, [xN]
2165 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2166 .addDef(Reg)
2167 .addUse(Reg, RegState::Kill)
2168 .addImm(0);
2169 } else {
2170       // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
2171       // than 32760.
2172 // It might be nice to use AArch64::MOVi32imm here, which would get
2173 // expanded in PreSched2 after PostRA, but our lone scratch Reg already
2174 // contains the MRS result. findScratchNonCalleeSaveRegister() in
2175 // AArch64FrameLowering might help us find such a scratch register
2176 // though. If we failed to find a scratch register, we could emit a
2177       // stream of add instructions to build up the immediate. Or, we could try
2178       // to insert an AArch64::MOVi32imm before register allocation so that we
2179       // wouldn't need to scavenge for a scratch register.
2180 report_fatal_error("Unable to encode Stack Protector Guard Offset");
2181 }
2182 MBB.erase(MI);
2183 return true;
2184 }
2185
2186 const GlobalValue *GV =
2187 cast<GlobalValue>((*MI.memoperands_begin())->getValue());
2188 const TargetMachine &TM = MBB.getParent()->getTarget();
2189 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2190 const unsigned char MO_NC = AArch64II::MO_NC;
2191
2192 if ((OpFlags & AArch64II::MO_GOT) != 0) {
2193 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
2194 .addGlobalAddress(GV, 0, OpFlags);
2195 if (Subtarget.isTargetILP32()) {
2196 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2197 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2198 .addDef(Reg32, RegState::Dead)
2199 .addUse(Reg, RegState::Kill)
2200 .addImm(0)
2201 .addMemOperand(*MI.memoperands_begin())
2202 .addDef(Reg, RegState::Implicit);
2203 } else {
2204 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2205 .addReg(Reg, RegState::Kill)
2206 .addImm(0)
2207 .addMemOperand(*MI.memoperands_begin());
2208 }
2209 } else if (TM.getCodeModel() == CodeModel::Large) {
2210 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
2211 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
2212 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
2213 .addImm(0);
2214 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2215 .addReg(Reg, RegState::Kill)
2216 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
2217 .addImm(16);
2218 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2219 .addReg(Reg, RegState::Kill)
2220 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
2221 .addImm(32);
2222 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2223 .addReg(Reg, RegState::Kill)
2224 .addGlobalAddress(GV, 0, AArch64II::MO_G3)
2225 .addImm(48);
2226 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2227 .addReg(Reg, RegState::Kill)
2228 .addImm(0)
2229 .addMemOperand(*MI.memoperands_begin());
2230 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2231 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
2232 .addGlobalAddress(GV, 0, OpFlags);
2233 } else {
2234 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
2235 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2236 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2237 if (Subtarget.isTargetILP32()) {
2238 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2239 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2240 .addDef(Reg32, RegState::Dead)
2241 .addUse(Reg, RegState::Kill)
2242 .addGlobalAddress(GV, 0, LoFlags)
2243 .addMemOperand(*MI.memoperands_begin())
2244 .addDef(Reg, RegState::Implicit);
2245 } else {
2246 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2247 .addReg(Reg, RegState::Kill)
2248 .addGlobalAddress(GV, 0, LoFlags)
2249 .addMemOperand(*MI.memoperands_begin());
2250 }
2251 }
2252
2253 MBB.erase(MI);
2254
2255 return true;
2256 }
2257
2258 // Return true if this instruction simply sets its single destination register
2259 // to zero. This is equivalent to a register rename of the zero-register.
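// For example, "movz w0, #0", "and w0, wzr, #0x1", and a COPY from WZR all
// qualify.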
2260 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
2261 switch (MI.getOpcode()) {
2262 default:
2263 break;
2264 case AArch64::MOVZWi:
2265 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2266 if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2267 assert(MI.getDesc().getNumOperands() == 3 &&
2268 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2269 return true;
2270 }
2271 break;
2272 case AArch64::ANDWri: // and Rd, Rzr, #imm
2273 return MI.getOperand(1).getReg() == AArch64::WZR;
2274 case AArch64::ANDXri:
2275 return MI.getOperand(1).getReg() == AArch64::XZR;
2276 case TargetOpcode::COPY:
2277 return MI.getOperand(1).getReg() == AArch64::WZR;
2278 }
2279 return false;
2280 }
2281
2282 // Return true if this instruction simply renames a general register without
2283 // modifying bits.
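// For example, "orr x0, xzr, x1" (LSL #0) and "add x0, x1, #0" (LSL #0) are
// both plain renames of x1.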
2284 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
2285 switch (MI.getOpcode()) {
2286 default:
2287 break;
2288 case TargetOpcode::COPY: {
2289     // GPR32 copies will be lowered to ORRXrs
2290 Register DstReg = MI.getOperand(0).getReg();
2291 return (AArch64::GPR32RegClass.contains(DstReg) ||
2292 AArch64::GPR64RegClass.contains(DstReg));
2293 }
2294 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2295 if (MI.getOperand(1).getReg() == AArch64::XZR) {
2296 assert(MI.getDesc().getNumOperands() == 4 &&
2297 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2298 return true;
2299 }
2300 break;
2301 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2302 if (MI.getOperand(2).getImm() == 0) {
2303 assert(MI.getDesc().getNumOperands() == 4 &&
2304 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2305 return true;
2306 }
2307 break;
2308 }
2309 return false;
2310 }
2311
2312 // Return true if this instruction simply renames a general register without
2313 // modifying bits.
2314 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
2315 switch (MI.getOpcode()) {
2316 default:
2317 break;
2318 case TargetOpcode::COPY: {
2319 Register DstReg = MI.getOperand(0).getReg();
2320 return AArch64::FPR128RegClass.contains(DstReg);
2321 }
2322 case AArch64::ORRv16i8:
2323 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2324 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2325 "invalid ORRv16i8 operands");
2326 return true;
2327 }
2328 break;
2329 }
2330 return false;
2331 }
2332
2333 Register AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
2334 int &FrameIndex) const {
2335 switch (MI.getOpcode()) {
2336 default:
2337 break;
2338 case AArch64::LDRWui:
2339 case AArch64::LDRXui:
2340 case AArch64::LDRBui:
2341 case AArch64::LDRHui:
2342 case AArch64::LDRSui:
2343 case AArch64::LDRDui:
2344 case AArch64::LDRQui:
2345 case AArch64::LDR_PXI:
2346 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2347 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2348 FrameIndex = MI.getOperand(1).getIndex();
2349 return MI.getOperand(0).getReg();
2350 }
2351 break;
2352 }
2353
2354 return 0;
2355 }
2356
2357 Register AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
2358 int &FrameIndex) const {
2359 switch (MI.getOpcode()) {
2360 default:
2361 break;
2362 case AArch64::STRWui:
2363 case AArch64::STRXui:
2364 case AArch64::STRBui:
2365 case AArch64::STRHui:
2366 case AArch64::STRSui:
2367 case AArch64::STRDui:
2368 case AArch64::STRQui:
2369 case AArch64::STR_PXI:
2370 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2371 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2372 FrameIndex = MI.getOperand(1).getIndex();
2373 return MI.getOperand(0).getReg();
2374 }
2375 break;
2376 }
2377 return 0;
2378 }
2379
2380 /// Check all MachineMemOperands for a hint to suppress pairing.
2381 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
2382 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2383 return MMO->getFlags() & MOSuppressPair;
2384 });
2385 }
2386
2387 /// Set a flag on the first MachineMemOperand to suppress pairing.
2388 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
2389 if (MI.memoperands_empty())
2390 return;
2391 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2392 }
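
// A minimal usage sketch (tryToFormPair is a hypothetical helper): a pairing
// pass would consult the hint before combining two accesses:
//
//   if (!AArch64InstrInfo::isLdStPairSuppressed(MI))
//     tryToFormPair(MI);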
2393
2394 /// Check all MachineMemOperands for a hint that the load/store is strided.
2395 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
2396 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2397 return MMO->getFlags() & MOStridedAccess;
2398 });
2399 }
2400
2401 bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) {
2402 switch (Opc) {
2403 default:
2404 return false;
2405 case AArch64::STURSi:
2406 case AArch64::STRSpre:
2407 case AArch64::STURDi:
2408 case AArch64::STRDpre:
2409 case AArch64::STURQi:
2410 case AArch64::STRQpre:
2411 case AArch64::STURBBi:
2412 case AArch64::STURHHi:
2413 case AArch64::STURWi:
2414 case AArch64::STRWpre:
2415 case AArch64::STURXi:
2416 case AArch64::STRXpre:
2417 case AArch64::LDURSi:
2418 case AArch64::LDRSpre:
2419 case AArch64::LDURDi:
2420 case AArch64::LDRDpre:
2421 case AArch64::LDURQi:
2422 case AArch64::LDRQpre:
2423 case AArch64::LDURWi:
2424 case AArch64::LDRWpre:
2425 case AArch64::LDURXi:
2426 case AArch64::LDRXpre:
2427 case AArch64::LDRSWpre:
2428 case AArch64::LDURSWi:
2429 case AArch64::LDURHHi:
2430 case AArch64::LDURBBi:
2431 case AArch64::LDURSBWi:
2432 case AArch64::LDURSHWi:
2433 return true;
2434 }
2435 }
2436
2437 std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2438 switch (Opc) {
2439 default: return {};
2440 case AArch64::PRFMui: return AArch64::PRFUMi;
2441 case AArch64::LDRXui: return AArch64::LDURXi;
2442 case AArch64::LDRWui: return AArch64::LDURWi;
2443 case AArch64::LDRBui: return AArch64::LDURBi;
2444 case AArch64::LDRHui: return AArch64::LDURHi;
2445 case AArch64::LDRSui: return AArch64::LDURSi;
2446 case AArch64::LDRDui: return AArch64::LDURDi;
2447 case AArch64::LDRQui: return AArch64::LDURQi;
2448 case AArch64::LDRBBui: return AArch64::LDURBBi;
2449 case AArch64::LDRHHui: return AArch64::LDURHHi;
2450 case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2451 case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2452 case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2453 case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2454 case AArch64::LDRSWui: return AArch64::LDURSWi;
2455 case AArch64::STRXui: return AArch64::STURXi;
2456 case AArch64::STRWui: return AArch64::STURWi;
2457 case AArch64::STRBui: return AArch64::STURBi;
2458 case AArch64::STRHui: return AArch64::STURHi;
2459 case AArch64::STRSui: return AArch64::STURSi;
2460 case AArch64::STRDui: return AArch64::STURDi;
2461 case AArch64::STRQui: return AArch64::STURQi;
2462 case AArch64::STRBBui: return AArch64::STURBBi;
2463 case AArch64::STRHHui: return AArch64::STURHHi;
2464 }
2465 }
2466
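// Return the operand index of the immediate offset for the given load/store
// opcode. For example, for LDRXui (ldr Xd, [Xn, #imm]) the immediate is
// operand 2, while pre/post-indexed forms such as LDRXpre carry an extra
// writeback def, shifting it to operand 3.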
2467 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
2468 switch (Opc) {
2469 default:
2470 llvm_unreachable("Unhandled Opcode in getLoadStoreImmIdx");
2471 case AArch64::ADDG:
2472 case AArch64::LDAPURBi:
2473 case AArch64::LDAPURHi:
2474 case AArch64::LDAPURi:
2475 case AArch64::LDAPURSBWi:
2476 case AArch64::LDAPURSBXi:
2477 case AArch64::LDAPURSHWi:
2478 case AArch64::LDAPURSHXi:
2479 case AArch64::LDAPURSWi:
2480 case AArch64::LDAPURXi:
2481 case AArch64::LDR_PPXI:
2482 case AArch64::LDR_PXI:
2483 case AArch64::LDR_ZXI:
2484 case AArch64::LDR_ZZXI:
2485 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
2486 case AArch64::LDR_ZZZXI:
2487 case AArch64::LDR_ZZZZXI:
2488 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
2489 case AArch64::LDRBBui:
2490 case AArch64::LDRBui:
2491 case AArch64::LDRDui:
2492 case AArch64::LDRHHui:
2493 case AArch64::LDRHui:
2494 case AArch64::LDRQui:
2495 case AArch64::LDRSBWui:
2496 case AArch64::LDRSBXui:
2497 case AArch64::LDRSHWui:
2498 case AArch64::LDRSHXui:
2499 case AArch64::LDRSui:
2500 case AArch64::LDRSWui:
2501 case AArch64::LDRWui:
2502 case AArch64::LDRXui:
2503 case AArch64::LDURBBi:
2504 case AArch64::LDURBi:
2505 case AArch64::LDURDi:
2506 case AArch64::LDURHHi:
2507 case AArch64::LDURHi:
2508 case AArch64::LDURQi:
2509 case AArch64::LDURSBWi:
2510 case AArch64::LDURSBXi:
2511 case AArch64::LDURSHWi:
2512 case AArch64::LDURSHXi:
2513 case AArch64::LDURSi:
2514 case AArch64::LDURSWi:
2515 case AArch64::LDURWi:
2516 case AArch64::LDURXi:
2517 case AArch64::PRFMui:
2518 case AArch64::PRFUMi:
2519 case AArch64::ST2Gi:
2520 case AArch64::STGi:
2521 case AArch64::STLURBi:
2522 case AArch64::STLURHi:
2523 case AArch64::STLURWi:
2524 case AArch64::STLURXi:
2525 case AArch64::StoreSwiftAsyncContext:
2526 case AArch64::STR_PPXI:
2527 case AArch64::STR_PXI:
2528 case AArch64::STR_ZXI:
2529 case AArch64::STR_ZZXI:
2530 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
2531 case AArch64::STR_ZZZXI:
2532 case AArch64::STR_ZZZZXI:
2533 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
2534 case AArch64::STRBBui:
2535 case AArch64::STRBui:
2536 case AArch64::STRDui:
2537 case AArch64::STRHHui:
2538 case AArch64::STRHui:
2539 case AArch64::STRQui:
2540 case AArch64::STRSui:
2541 case AArch64::STRWui:
2542 case AArch64::STRXui:
2543 case AArch64::STURBBi:
2544 case AArch64::STURBi:
2545 case AArch64::STURDi:
2546 case AArch64::STURHHi:
2547 case AArch64::STURHi:
2548 case AArch64::STURQi:
2549 case AArch64::STURSi:
2550 case AArch64::STURWi:
2551 case AArch64::STURXi:
2552 case AArch64::STZ2Gi:
2553 case AArch64::STZGi:
2554 case AArch64::TAGPstack:
2555 case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO:
2556 case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO:
2557 return 2;
2558 case AArch64::LD1B_D_IMM:
2559 case AArch64::LD1B_H_IMM:
2560 case AArch64::LD1B_IMM:
2561 case AArch64::LD1B_S_IMM:
2562 case AArch64::LD1D_IMM:
2563 case AArch64::LD1H_D_IMM:
2564 case AArch64::LD1H_IMM:
2565 case AArch64::LD1H_S_IMM:
2566 case AArch64::LD1RB_D_IMM:
2567 case AArch64::LD1RB_H_IMM:
2568 case AArch64::LD1RB_IMM:
2569 case AArch64::LD1RB_S_IMM:
2570 case AArch64::LD1RD_IMM:
2571 case AArch64::LD1RH_D_IMM:
2572 case AArch64::LD1RH_IMM:
2573 case AArch64::LD1RH_S_IMM:
2574 case AArch64::LD1RSB_D_IMM:
2575 case AArch64::LD1RSB_H_IMM:
2576 case AArch64::LD1RSB_S_IMM:
2577 case AArch64::LD1RSH_D_IMM:
2578 case AArch64::LD1RSH_S_IMM:
2579 case AArch64::LD1RSW_IMM:
2580 case AArch64::LD1RW_D_IMM:
2581 case AArch64::LD1RW_IMM:
2582 case AArch64::LD1SB_D_IMM:
2583 case AArch64::LD1SB_H_IMM:
2584 case AArch64::LD1SB_S_IMM:
2585 case AArch64::LD1SH_D_IMM:
2586 case AArch64::LD1SH_S_IMM:
2587 case AArch64::LD1SW_D_IMM:
2588 case AArch64::LD1W_D_IMM:
2589 case AArch64::LD1W_IMM:
2590 case AArch64::LD2B_IMM:
2591 case AArch64::LD2D_IMM:
2592 case AArch64::LD2H_IMM:
2593 case AArch64::LD2W_IMM:
2594 case AArch64::LD3B_IMM:
2595 case AArch64::LD3D_IMM:
2596 case AArch64::LD3H_IMM:
2597 case AArch64::LD3W_IMM:
2598 case AArch64::LD4B_IMM:
2599 case AArch64::LD4D_IMM:
2600 case AArch64::LD4H_IMM:
2601 case AArch64::LD4W_IMM:
2602 case AArch64::LDG:
2603 case AArch64::LDNF1B_D_IMM:
2604 case AArch64::LDNF1B_H_IMM:
2605 case AArch64::LDNF1B_IMM:
2606 case AArch64::LDNF1B_S_IMM:
2607 case AArch64::LDNF1D_IMM:
2608 case AArch64::LDNF1H_D_IMM:
2609 case AArch64::LDNF1H_IMM:
2610 case AArch64::LDNF1H_S_IMM:
2611 case AArch64::LDNF1SB_D_IMM:
2612 case AArch64::LDNF1SB_H_IMM:
2613 case AArch64::LDNF1SB_S_IMM:
2614 case AArch64::LDNF1SH_D_IMM:
2615 case AArch64::LDNF1SH_S_IMM:
2616 case AArch64::LDNF1SW_D_IMM:
2617 case AArch64::LDNF1W_D_IMM:
2618 case AArch64::LDNF1W_IMM:
2619 case AArch64::LDNPDi:
2620 case AArch64::LDNPQi:
2621 case AArch64::LDNPSi:
2622 case AArch64::LDNPWi:
2623 case AArch64::LDNPXi:
2624 case AArch64::LDNT1B_ZRI:
2625 case AArch64::LDNT1D_ZRI:
2626 case AArch64::LDNT1H_ZRI:
2627 case AArch64::LDNT1W_ZRI:
2628 case AArch64::LDPDi:
2629 case AArch64::LDPQi:
2630 case AArch64::LDPSi:
2631 case AArch64::LDPWi:
2632 case AArch64::LDPXi:
2633 case AArch64::LDRBBpost:
2634 case AArch64::LDRBBpre:
2635 case AArch64::LDRBpost:
2636 case AArch64::LDRBpre:
2637 case AArch64::LDRDpost:
2638 case AArch64::LDRDpre:
2639 case AArch64::LDRHHpost:
2640 case AArch64::LDRHHpre:
2641 case AArch64::LDRHpost:
2642 case AArch64::LDRHpre:
2643 case AArch64::LDRQpost:
2644 case AArch64::LDRQpre:
2645 case AArch64::LDRSpost:
2646 case AArch64::LDRSpre:
2647 case AArch64::LDRWpost:
2648 case AArch64::LDRWpre:
2649 case AArch64::LDRXpost:
2650 case AArch64::LDRXpre:
2651 case AArch64::ST1B_D_IMM:
2652 case AArch64::ST1B_H_IMM:
2653 case AArch64::ST1B_IMM:
2654 case AArch64::ST1B_S_IMM:
2655 case AArch64::ST1D_IMM:
2656 case AArch64::ST1H_D_IMM:
2657 case AArch64::ST1H_IMM:
2658 case AArch64::ST1H_S_IMM:
2659 case AArch64::ST1W_D_IMM:
2660 case AArch64::ST1W_IMM:
2661 case AArch64::ST2B_IMM:
2662 case AArch64::ST2D_IMM:
2663 case AArch64::ST2H_IMM:
2664 case AArch64::ST2W_IMM:
2665 case AArch64::ST3B_IMM:
2666 case AArch64::ST3D_IMM:
2667 case AArch64::ST3H_IMM:
2668 case AArch64::ST3W_IMM:
2669 case AArch64::ST4B_IMM:
2670 case AArch64::ST4D_IMM:
2671 case AArch64::ST4H_IMM:
2672 case AArch64::ST4W_IMM:
2673 case AArch64::STGPi:
2674 case AArch64::STGPreIndex:
2675 case AArch64::STZGPreIndex:
2676 case AArch64::ST2GPreIndex:
2677 case AArch64::STZ2GPreIndex:
2678 case AArch64::STGPostIndex:
2679 case AArch64::STZGPostIndex:
2680 case AArch64::ST2GPostIndex:
2681 case AArch64::STZ2GPostIndex:
2682 case AArch64::STNPDi:
2683 case AArch64::STNPQi:
2684 case AArch64::STNPSi:
2685 case AArch64::STNPWi:
2686 case AArch64::STNPXi:
2687 case AArch64::STNT1B_ZRI:
2688 case AArch64::STNT1D_ZRI:
2689 case AArch64::STNT1H_ZRI:
2690 case AArch64::STNT1W_ZRI:
2691 case AArch64::STPDi:
2692 case AArch64::STPQi:
2693 case AArch64::STPSi:
2694 case AArch64::STPWi:
2695 case AArch64::STPXi:
2696 case AArch64::STRBBpost:
2697 case AArch64::STRBBpre:
2698 case AArch64::STRBpost:
2699 case AArch64::STRBpre:
2700 case AArch64::STRDpost:
2701 case AArch64::STRDpre:
2702 case AArch64::STRHHpost:
2703 case AArch64::STRHHpre:
2704 case AArch64::STRHpost:
2705 case AArch64::STRHpre:
2706 case AArch64::STRQpost:
2707 case AArch64::STRQpre:
2708 case AArch64::STRSpost:
2709 case AArch64::STRSpre:
2710 case AArch64::STRWpost:
2711 case AArch64::STRWpre:
2712 case AArch64::STRXpost:
2713 case AArch64::STRXpre:
2714 return 3;
2715 case AArch64::LDPDpost:
2716 case AArch64::LDPDpre:
2717 case AArch64::LDPQpost:
2718 case AArch64::LDPQpre:
2719 case AArch64::LDPSpost:
2720 case AArch64::LDPSpre:
2721 case AArch64::LDPWpost:
2722 case AArch64::LDPWpre:
2723 case AArch64::LDPXpost:
2724 case AArch64::LDPXpre:
2725 case AArch64::STGPpre:
2726 case AArch64::STGPpost:
2727 case AArch64::STPDpost:
2728 case AArch64::STPDpre:
2729 case AArch64::STPQpost:
2730 case AArch64::STPQpre:
2731 case AArch64::STPSpost:
2732 case AArch64::STPSpre:
2733 case AArch64::STPWpost:
2734 case AArch64::STPWpre:
2735 case AArch64::STPXpost:
2736 case AArch64::STPXpre:
2737 return 4;
2738 }
2739 }
2740
2741 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
2742 switch (MI.getOpcode()) {
2743 default:
2744 return false;
2745 // Scaled instructions.
2746 case AArch64::STRSui:
2747 case AArch64::STRDui:
2748 case AArch64::STRQui:
2749 case AArch64::STRXui:
2750 case AArch64::STRWui:
2751 case AArch64::LDRSui:
2752 case AArch64::LDRDui:
2753 case AArch64::LDRQui:
2754 case AArch64::LDRXui:
2755 case AArch64::LDRWui:
2756 case AArch64::LDRSWui:
2757 // Unscaled instructions.
2758 case AArch64::STURSi:
2759 case AArch64::STRSpre:
2760 case AArch64::STURDi:
2761 case AArch64::STRDpre:
2762 case AArch64::STURQi:
2763 case AArch64::STRQpre:
2764 case AArch64::STURWi:
2765 case AArch64::STRWpre:
2766 case AArch64::STURXi:
2767 case AArch64::STRXpre:
2768 case AArch64::LDURSi:
2769 case AArch64::LDRSpre:
2770 case AArch64::LDURDi:
2771 case AArch64::LDRDpre:
2772 case AArch64::LDURQi:
2773 case AArch64::LDRQpre:
2774 case AArch64::LDURWi:
2775 case AArch64::LDRWpre:
2776 case AArch64::LDURXi:
2777 case AArch64::LDRXpre:
2778 case AArch64::LDURSWi:
2779 case AArch64::LDRSWpre:
2780 // SVE instructions.
2781 case AArch64::LDR_ZXI:
2782 case AArch64::STR_ZXI:
2783 return true;
2784 }
2785 }
2786
2787 bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) {
2788 switch (MI.getOpcode()) {
2789 default:
2790 assert((!MI.isCall() || !MI.isReturn()) &&
2791 "Unexpected instruction - was a new tail call opcode introduced?");
2792 return false;
2793 case AArch64::TCRETURNdi:
2794 case AArch64::TCRETURNri:
2795 case AArch64::TCRETURNrix16x17:
2796 case AArch64::TCRETURNrix17:
2797 case AArch64::TCRETURNrinotx16:
2798 case AArch64::TCRETURNriALL:
2799 case AArch64::AUTH_TCRETURN:
2800 case AArch64::AUTH_TCRETURN_BTI:
2801 return true;
2802 }
2803 }
2804
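// Map an opcode to its flag-setting equivalent, e.g. ADDWri -> ADDSWri for
// base instructions and AND_PPzPP -> ANDS_PPzPP for SVE predicate operations.
// Unreachable for opcodes without a flag-setting form.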
2805 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) {
2806 switch (Opc) {
2807 default:
2808 llvm_unreachable("Opcode has no flag setting equivalent!");
2809 // 32-bit cases:
2810 case AArch64::ADDWri:
2811 return AArch64::ADDSWri;
2812 case AArch64::ADDWrr:
2813 return AArch64::ADDSWrr;
2814 case AArch64::ADDWrs:
2815 return AArch64::ADDSWrs;
2816 case AArch64::ADDWrx:
2817 return AArch64::ADDSWrx;
2818 case AArch64::ANDWri:
2819 return AArch64::ANDSWri;
2820 case AArch64::ANDWrr:
2821 return AArch64::ANDSWrr;
2822 case AArch64::ANDWrs:
2823 return AArch64::ANDSWrs;
2824 case AArch64::BICWrr:
2825 return AArch64::BICSWrr;
2826 case AArch64::BICWrs:
2827 return AArch64::BICSWrs;
2828 case AArch64::SUBWri:
2829 return AArch64::SUBSWri;
2830 case AArch64::SUBWrr:
2831 return AArch64::SUBSWrr;
2832 case AArch64::SUBWrs:
2833 return AArch64::SUBSWrs;
2834 case AArch64::SUBWrx:
2835 return AArch64::SUBSWrx;
2836 // 64-bit cases:
2837 case AArch64::ADDXri:
2838 return AArch64::ADDSXri;
2839 case AArch64::ADDXrr:
2840 return AArch64::ADDSXrr;
2841 case AArch64::ADDXrs:
2842 return AArch64::ADDSXrs;
2843 case AArch64::ADDXrx:
2844 return AArch64::ADDSXrx;
2845 case AArch64::ANDXri:
2846 return AArch64::ANDSXri;
2847 case AArch64::ANDXrr:
2848 return AArch64::ANDSXrr;
2849 case AArch64::ANDXrs:
2850 return AArch64::ANDSXrs;
2851 case AArch64::BICXrr:
2852 return AArch64::BICSXrr;
2853 case AArch64::BICXrs:
2854 return AArch64::BICSXrs;
2855 case AArch64::SUBXri:
2856 return AArch64::SUBSXri;
2857 case AArch64::SUBXrr:
2858 return AArch64::SUBSXrr;
2859 case AArch64::SUBXrs:
2860 return AArch64::SUBSXrs;
2861 case AArch64::SUBXrx:
2862 return AArch64::SUBSXrx;
2863 // SVE instructions:
2864 case AArch64::AND_PPzPP:
2865 return AArch64::ANDS_PPzPP;
2866 case AArch64::BIC_PPzPP:
2867 return AArch64::BICS_PPzPP;
2868 case AArch64::EOR_PPzPP:
2869 return AArch64::EORS_PPzPP;
2870 case AArch64::NAND_PPzPP:
2871 return AArch64::NANDS_PPzPP;
2872 case AArch64::NOR_PPzPP:
2873 return AArch64::NORS_PPzPP;
2874 case AArch64::ORN_PPzPP:
2875 return AArch64::ORNS_PPzPP;
2876 case AArch64::ORR_PPzPP:
2877 return AArch64::ORRS_PPzPP;
2878 case AArch64::BRKA_PPzP:
2879 return AArch64::BRKAS_PPzP;
2880 case AArch64::BRKPA_PPzPP:
2881 return AArch64::BRKPAS_PPzPP;
2882 case AArch64::BRKB_PPzP:
2883 return AArch64::BRKBS_PPzP;
2884 case AArch64::BRKPB_PPzPP:
2885 return AArch64::BRKPBS_PPzPP;
2886 case AArch64::BRKN_PPzP:
2887 return AArch64::BRKNS_PPzP;
2888 case AArch64::RDFFR_PPz:
2889 return AArch64::RDFFRS_PPz;
2890 case AArch64::PTRUE_B:
2891 return AArch64::PTRUES_B;
2892 }
2893 }
2894
2895 // Is this a candidate for ld/st merging or pairing? For example, we don't
2896 // touch volatiles or load/stores that have a hint to avoid pair formation.
2897 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
2898
2899 bool IsPreLdSt = isPreLdSt(MI);
2900
2901 // If this is a volatile load/store, don't mess with it.
2902 if (MI.hasOrderedMemoryRef())
2903 return false;
2904
2905 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
2906 // For Pre-inc LD/ST, the operand is shifted by one.
2907 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
2908 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
2909 "Expected a reg or frame index operand.");
2910
2911 // For Pre-indexed addressing quadword instructions, the third operand is the
2912 // immediate value.
2913 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
2914
2915 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
2916 return false;
2917
2918 // Can't merge/pair if the instruction modifies the base register.
2919 // e.g., ldr x0, [x0]
2920 // This case will never occur with an FI base.
2921 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
2922 // STR<S,D,Q,W,X>pre, it can be merged.
2923 // For example:
2924 // ldr q0, [x11, #32]!
2925 // ldr q1, [x11, #16]
2926 // to
2927 // ldp q0, q1, [x11, #32]!
2928 if (MI.getOperand(1).isReg() && !IsPreLdSt) {
2929 Register BaseReg = MI.getOperand(1).getReg();
2930 const TargetRegisterInfo *TRI = &getRegisterInfo();
2931 if (MI.modifiesRegister(BaseReg, TRI))
2932 return false;
2933 }
2934
2935 // Pairing SVE fills/spills is only valid for little-endian targets that
2936 // implement VLS 128.
2937 switch (MI.getOpcode()) {
2938 default:
2939 break;
2940 case AArch64::LDR_ZXI:
2941 case AArch64::STR_ZXI:
2942 if (!Subtarget.isLittleEndian() ||
2943 Subtarget.getSVEVectorSizeInBits() != 128)
2944 return false;
2945 }
2946
2947 // Check if this load/store has a hint to avoid pair formation.
2948 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
2949 if (isLdStPairSuppressed(MI))
2950 return false;
2951
2952   // Do not pair any callee-save store/reload instructions in the
2953   // prologue/epilogue if the CFI information encoded the operations as
2954   // separate instructions, as that will cause the size of the actual prologue
2955   // to differ from the prologue size recorded in the Windows CFI.
2956 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
2957 bool NeedsWinCFI = MAI->usesWindowsCFI() &&
2958 MI.getMF()->getFunction().needsUnwindTableEntry();
2959 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
2960 MI.getFlag(MachineInstr::FrameDestroy)))
2961 return false;
2962
2963 // On some CPUs quad load/store pairs are slower than two single load/stores.
2964 if (Subtarget.isPaired128Slow()) {
2965 switch (MI.getOpcode()) {
2966 default:
2967 break;
2968 case AArch64::LDURQi:
2969 case AArch64::STURQi:
2970 case AArch64::LDRQui:
2971 case AArch64::STRQui:
2972 return false;
2973 }
2974 }
2975
2976 return true;
2977 }
2978
2979 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
2980 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
2981 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
2982 const TargetRegisterInfo *TRI) const {
2983 if (!LdSt.mayLoadOrStore())
2984 return false;
2985
2986 const MachineOperand *BaseOp;
2987 TypeSize WidthN(0, false);
2988 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
2989 WidthN, TRI))
2990 return false;
2991   // The maximum vscale is 16 under AArch64; return the maximal extent for
2992   // the vector.
2993 Width = LocationSize::precise(WidthN);
2994 BaseOps.push_back(BaseOp);
2995 return true;
2996 }
2997
2998 std::optional<ExtAddrMode>
2999 AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
3000 const TargetRegisterInfo *TRI) const {
3001 const MachineOperand *Base; // Filled with the base operand of MI.
3002 int64_t Offset; // Filled with the offset of MI.
3003 bool OffsetIsScalable;
3004 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
3005 return std::nullopt;
3006
3007 if (!Base->isReg())
3008 return std::nullopt;
3009 ExtAddrMode AM;
3010 AM.BaseReg = Base->getReg();
3011 AM.Displacement = Offset;
3012 AM.ScaledReg = 0;
3013 AM.Scale = 0;
3014 return AM;
3015 }
3016
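// Check whether folding the address computation AddrI (which defines Reg)
// into the addressing mode of the memory instruction MemI is possible; on
// success, AM describes the resulting addressing mode. For example (a
// sketch):
//   add x8, x9, x10, lsl #3
//   ldr x0, [x8]
// can fold to
//   ldr x0, [x9, x10, lsl #3]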
3017 bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
3018 Register Reg,
3019 const MachineInstr &AddrI,
3020 ExtAddrMode &AM) const {
3021 // Filter out instructions into which we cannot fold.
3022 unsigned NumBytes;
3023 int64_t OffsetScale = 1;
3024 switch (MemI.getOpcode()) {
3025 default:
3026 return false;
3027
3028 case AArch64::LDURQi:
3029 case AArch64::STURQi:
3030 NumBytes = 16;
3031 break;
3032
3033 case AArch64::LDURDi:
3034 case AArch64::STURDi:
3035 case AArch64::LDURXi:
3036 case AArch64::STURXi:
3037 NumBytes = 8;
3038 break;
3039
3040 case AArch64::LDURWi:
3041 case AArch64::LDURSWi:
3042 case AArch64::STURWi:
3043 NumBytes = 4;
3044 break;
3045
3046 case AArch64::LDURHi:
3047 case AArch64::STURHi:
3048 case AArch64::LDURHHi:
3049 case AArch64::STURHHi:
3050 case AArch64::LDURSHXi:
3051 case AArch64::LDURSHWi:
3052 NumBytes = 2;
3053 break;
3054
3055 case AArch64::LDRBroX:
3056 case AArch64::LDRBBroX:
3057 case AArch64::LDRSBXroX:
3058 case AArch64::LDRSBWroX:
3059 case AArch64::STRBroX:
3060 case AArch64::STRBBroX:
3061 case AArch64::LDURBi:
3062 case AArch64::LDURBBi:
3063 case AArch64::LDURSBXi:
3064 case AArch64::LDURSBWi:
3065 case AArch64::STURBi:
3066 case AArch64::STURBBi:
3067 case AArch64::LDRBui:
3068 case AArch64::LDRBBui:
3069 case AArch64::LDRSBXui:
3070 case AArch64::LDRSBWui:
3071 case AArch64::STRBui:
3072 case AArch64::STRBBui:
3073 NumBytes = 1;
3074 break;
3075
3076 case AArch64::LDRQroX:
3077 case AArch64::STRQroX:
3078 case AArch64::LDRQui:
3079 case AArch64::STRQui:
3080 NumBytes = 16;
3081 OffsetScale = 16;
3082 break;
3083
3084 case AArch64::LDRDroX:
3085 case AArch64::STRDroX:
3086 case AArch64::LDRXroX:
3087 case AArch64::STRXroX:
3088 case AArch64::LDRDui:
3089 case AArch64::STRDui:
3090 case AArch64::LDRXui:
3091 case AArch64::STRXui:
3092 NumBytes = 8;
3093 OffsetScale = 8;
3094 break;
3095
3096 case AArch64::LDRWroX:
3097 case AArch64::LDRSWroX:
3098 case AArch64::STRWroX:
3099 case AArch64::LDRWui:
3100 case AArch64::LDRSWui:
3101 case AArch64::STRWui:
3102 NumBytes = 4;
3103 OffsetScale = 4;
3104 break;
3105
3106 case AArch64::LDRHroX:
3107 case AArch64::STRHroX:
3108 case AArch64::LDRHHroX:
3109 case AArch64::STRHHroX:
3110 case AArch64::LDRSHXroX:
3111 case AArch64::LDRSHWroX:
3112 case AArch64::LDRHui:
3113 case AArch64::STRHui:
3114 case AArch64::LDRHHui:
3115 case AArch64::STRHHui:
3116 case AArch64::LDRSHXui:
3117 case AArch64::LDRSHWui:
3118 NumBytes = 2;
3119 OffsetScale = 2;
3120 break;
3121 }
3122
3123 // Check the fold operand is not the loaded/stored value.
3124 const MachineOperand &BaseRegOp = MemI.getOperand(0);
3125 if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
3126 return false;
3127
3128 // Handle memory instructions with a [Reg, Reg] addressing mode.
3129 if (MemI.getOperand(2).isReg()) {
3130 // Bail if the addressing mode already includes extension of the offset
3131 // register.
3132 if (MemI.getOperand(3).getImm())
3133 return false;
3134
3135 // Check if we actually have a scaled offset.
3136 if (MemI.getOperand(4).getImm() == 0)
3137 OffsetScale = 1;
3138
3139     // If the address instruction is folded into the base register, then the
3140     // addressing mode must not have a scale. We can then swap the base and the
3141     // scaled registers.
3142 if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)
3143 return false;
3144
3145 switch (AddrI.getOpcode()) {
3146 default:
3147 return false;
3148
3149 case AArch64::SBFMXri:
3150 // sxtw Xa, Wm
3151 // ldr Xd, [Xn, Xa, lsl #N]
3152 // ->
3153 // ldr Xd, [Xn, Wm, sxtw #N]
3154 if (AddrI.getOperand(2).getImm() != 0 ||
3155 AddrI.getOperand(3).getImm() != 31)
3156 return false;
3157
3158 AM.BaseReg = MemI.getOperand(1).getReg();
3159 if (AM.BaseReg == Reg)
3160 AM.BaseReg = MemI.getOperand(2).getReg();
3161 AM.ScaledReg = AddrI.getOperand(1).getReg();
3162 AM.Scale = OffsetScale;
3163 AM.Displacement = 0;
3164 AM.Form = ExtAddrMode::Formula::SExtScaledReg;
3165 return true;
3166
3167 case TargetOpcode::SUBREG_TO_REG: {
3168 // mov Wa, Wm
3169 // ldr Xd, [Xn, Xa, lsl #N]
3170 // ->
3171 // ldr Xd, [Xn, Wm, uxtw #N]
3172
3173 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
3174 if (AddrI.getOperand(1).getImm() != 0 ||
3175 AddrI.getOperand(3).getImm() != AArch64::sub_32)
3176 return false;
3177
3178 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
3179 Register OffsetReg = AddrI.getOperand(2).getReg();
3180 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg))
3181 return false;
3182
3183 const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg);
3184 if (DefMI.getOpcode() != AArch64::ORRWrs ||
3185 DefMI.getOperand(1).getReg() != AArch64::WZR ||
3186 DefMI.getOperand(3).getImm() != 0)
3187 return false;
3188
3189 AM.BaseReg = MemI.getOperand(1).getReg();
3190 if (AM.BaseReg == Reg)
3191 AM.BaseReg = MemI.getOperand(2).getReg();
3192 AM.ScaledReg = DefMI.getOperand(2).getReg();
3193 AM.Scale = OffsetScale;
3194 AM.Displacement = 0;
3195 AM.Form = ExtAddrMode::Formula::ZExtScaledReg;
3196 return true;
3197 }
3198 }
3199 }
3200
3201 // Handle memory instructions with a [Reg, #Imm] addressing mode.
3202
3203 // Check we are not breaking a potential conversion to an LDP.
3204 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
3205 int64_t NewOffset) -> bool {
3206 int64_t MinOffset, MaxOffset;
3207 switch (NumBytes) {
3208 default:
3209 return true;
3210 case 4:
3211 MinOffset = -256;
3212 MaxOffset = 252;
3213 break;
3214 case 8:
3215 MinOffset = -512;
3216 MaxOffset = 504;
3217 break;
3218 case 16:
3219 MinOffset = -1024;
3220 MaxOffset = 1008;
3221 break;
3222 }
3223 return OldOffset < MinOffset || OldOffset > MaxOffset ||
3224 (NewOffset >= MinOffset && NewOffset <= MaxOffset);
3225 };
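  // For example (a sketch of validateOffsetForLDP): with NumBytes == 8 the
  // LDP-encodable range is [-512, 504], so a fold that turns an old offset of
  // 504 into 512 is rejected, while an old offset of 520 (already outside the
  // range) places no constraint on the new offset.
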
3226 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
3227 int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;
3228 int64_t NewOffset = OldOffset + Disp;
3229 if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))
3230 return false;
3231 // If the old offset would fit into an LDP, but the new offset wouldn't,
3232 // bail out.
3233 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
3234 return false;
3235 AM.BaseReg = AddrI.getOperand(1).getReg();
3236 AM.ScaledReg = 0;
3237 AM.Scale = 0;
3238 AM.Displacement = NewOffset;
3239 AM.Form = ExtAddrMode::Formula::Basic;
3240 return true;
3241 };
3242
3243 auto canFoldAddRegIntoAddrMode =
3244 [&](int64_t Scale,
3245 ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool {
3246 if (MemI.getOperand(2).getImm() != 0)
3247 return false;
3248 if ((unsigned)Scale != Scale)
3249 return false;
3250 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
3251 return false;
3252 AM.BaseReg = AddrI.getOperand(1).getReg();
3253 AM.ScaledReg = AddrI.getOperand(2).getReg();
3254 AM.Scale = Scale;
3255 AM.Displacement = 0;
3256 AM.Form = Form;
3257 return true;
3258 };
3259
3260 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
3261 unsigned Opcode = MemI.getOpcode();
3262 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
3263 Subtarget.isSTRQroSlow();
3264 };
3265
3266 int64_t Disp = 0;
3267 const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
3268 switch (AddrI.getOpcode()) {
3269 default:
3270 return false;
3271
3272 case AArch64::ADDXri:
3273 // add Xa, Xn, #N
3274 // ldr Xd, [Xa, #M]
3275 // ->
3276 // ldr Xd, [Xn, #N'+M]
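// N' is N optionally shifted left by 12, since the ADDXri shift operand
// encodes an LSL of either 0 or 12.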
3277 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3278 return canFoldAddSubImmIntoAddrMode(Disp);
3279
3280 case AArch64::SUBXri:
3281 // sub Xa, Xn, #N
3282 // ldr Xd, [Xa, #M]
3283 // ->
3284 // ldr Xd, [Xn, #N'+M]
3285 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3286 return canFoldAddSubImmIntoAddrMode(-Disp);
3287
3288 case AArch64::ADDXrs: {
3289 // add Xa, Xn, Xm, lsl #N
3290 // ldr Xd, [Xa]
3291 // ->
3292 // ldr Xd, [Xn, Xm, lsl #N]
3293
3294 // Don't fold the add if the result would be slower, unless optimising for
3295 // size.
3296 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3297 if (AArch64_AM::getShiftType(Shift) != AArch64_AM::ShiftExtendType::LSL)
3298 return false;
3299 Shift = AArch64_AM::getShiftValue(Shift);
3300 if (!OptSize) {
3301 if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
3302 return false;
3303 if (avoidSlowSTRQ(MemI))
3304 return false;
3305 }
3306 return canFoldAddRegIntoAddrMode(1ULL << Shift);
3307 }
3308
3309 case AArch64::ADDXrr:
3310 // add Xa, Xn, Xm
3311 // ldr Xd, [Xa]
3312 // ->
3313 // ldr Xd, [Xn, Xm, lsl #0]
3314
3315 // Don't fold the add if the result would be slower, unless optimising for
3316 // size.
3317 if (!OptSize && avoidSlowSTRQ(MemI))
3318 return false;
3319 return canFoldAddRegIntoAddrMode(1);
3320
3321 case AArch64::ADDXrx:
3322 // add Xa, Xn, Wm, {s,u}xtw #N
3323 // ldr Xd, [Xa]
3324 // ->
3325 // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3326
3327 // Don't fold the add if the result would be slower, unless optimising for
3328 // size.
3329 if (!OptSize && avoidSlowSTRQ(MemI))
3330 return false;
3331
3332 // Can fold only sign-/zero-extend of a word.
3333 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3334 AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm);
3335 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3336 return false;
3337
3338 return canFoldAddRegIntoAddrMode(
3339 1ULL << AArch64_AM::getArithShiftValue(Imm),
3340 (Extend == AArch64_AM::SXTW) ? ExtAddrMode::Formula::SExtScaledReg
3341 : ExtAddrMode::Formula::ZExtScaledReg);
3342 }
3343 }
3344
3345 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3346 // return the opcode of an instruction performing the same operation, but using
3347 // the [Reg, Reg] addressing mode.
3348 static unsigned regOffsetOpcode(unsigned Opcode) {
3349 switch (Opcode) {
3350 default:
3351 llvm_unreachable("Address folding not implemented for instruction");
3352
3353 case AArch64::LDURQi:
3354 case AArch64::LDRQui:
3355 return AArch64::LDRQroX;
3356 case AArch64::STURQi:
3357 case AArch64::STRQui:
3358 return AArch64::STRQroX;
3359 case AArch64::LDURDi:
3360 case AArch64::LDRDui:
3361 return AArch64::LDRDroX;
3362 case AArch64::STURDi:
3363 case AArch64::STRDui:
3364 return AArch64::STRDroX;
3365 case AArch64::LDURXi:
3366 case AArch64::LDRXui:
3367 return AArch64::LDRXroX;
3368 case AArch64::STURXi:
3369 case AArch64::STRXui:
3370 return AArch64::STRXroX;
3371 case AArch64::LDURWi:
3372 case AArch64::LDRWui:
3373 return AArch64::LDRWroX;
3374 case AArch64::LDURSWi:
3375 case AArch64::LDRSWui:
3376 return AArch64::LDRSWroX;
3377 case AArch64::STURWi:
3378 case AArch64::STRWui:
3379 return AArch64::STRWroX;
3380 case AArch64::LDURHi:
3381 case AArch64::LDRHui:
3382 return AArch64::LDRHroX;
3383 case AArch64::STURHi:
3384 case AArch64::STRHui:
3385 return AArch64::STRHroX;
3386 case AArch64::LDURHHi:
3387 case AArch64::LDRHHui:
3388 return AArch64::LDRHHroX;
3389 case AArch64::STURHHi:
3390 case AArch64::STRHHui:
3391 return AArch64::STRHHroX;
3392 case AArch64::LDURSHXi:
3393 case AArch64::LDRSHXui:
3394 return AArch64::LDRSHXroX;
3395 case AArch64::LDURSHWi:
3396 case AArch64::LDRSHWui:
3397 return AArch64::LDRSHWroX;
3398 case AArch64::LDURBi:
3399 case AArch64::LDRBui:
3400 return AArch64::LDRBroX;
3401 case AArch64::LDURBBi:
3402 case AArch64::LDRBBui:
3403 return AArch64::LDRBBroX;
3404 case AArch64::LDURSBXi:
3405 case AArch64::LDRSBXui:
3406 return AArch64::LDRSBXroX;
3407 case AArch64::LDURSBWi:
3408 case AArch64::LDRSBWui:
3409 return AArch64::LDRSBWroX;
3410 case AArch64::STURBi:
3411 case AArch64::STRBui:
3412 return AArch64::STRBroX;
3413 case AArch64::STURBBi:
3414 case AArch64::STRBBui:
3415 return AArch64::STRBBroX;
3416 }
3417 }
3418
3419 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3420 // the opcode of an instruction performing the same operation, but using the
3421 // [Reg, #Imm] addressing mode with scaled offset.
3422 static unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3423 switch (Opcode) {
3424 default:
3425 llvm_unreachable("Address folding not implemented for instruction");
3426
3427 case AArch64::LDURQi:
3428 Scale = 16;
3429 return AArch64::LDRQui;
3430 case AArch64::STURQi:
3431 Scale = 16;
3432 return AArch64::STRQui;
3433 case AArch64::LDURDi:
3434 Scale = 8;
3435 return AArch64::LDRDui;
3436 case AArch64::STURDi:
3437 Scale = 8;
3438 return AArch64::STRDui;
3439 case AArch64::LDURXi:
3440 Scale = 8;
3441 return AArch64::LDRXui;
3442 case AArch64::STURXi:
3443 Scale = 8;
3444 return AArch64::STRXui;
3445 case AArch64::LDURWi:
3446 Scale = 4;
3447 return AArch64::LDRWui;
3448 case AArch64::LDURSWi:
3449 Scale = 4;
3450 return AArch64::LDRSWui;
3451 case AArch64::STURWi:
3452 Scale = 4;
3453 return AArch64::STRWui;
3454 case AArch64::LDURHi:
3455 Scale = 2;
3456 return AArch64::LDRHui;
3457 case AArch64::STURHi:
3458 Scale = 2;
3459 return AArch64::STRHui;
3460 case AArch64::LDURHHi:
3461 Scale = 2;
3462 return AArch64::LDRHHui;
3463 case AArch64::STURHHi:
3464 Scale = 2;
3465 return AArch64::STRHHui;
3466 case AArch64::LDURSHXi:
3467 Scale = 2;
3468 return AArch64::LDRSHXui;
3469 case AArch64::LDURSHWi:
3470 Scale = 2;
3471 return AArch64::LDRSHWui;
3472 case AArch64::LDURBi:
3473 Scale = 1;
3474 return AArch64::LDRBui;
3475 case AArch64::LDURBBi:
3476 Scale = 1;
3477 return AArch64::LDRBBui;
3478 case AArch64::LDURSBXi:
3479 Scale = 1;
3480 return AArch64::LDRSBXui;
3481 case AArch64::LDURSBWi:
3482 Scale = 1;
3483 return AArch64::LDRSBWui;
3484 case AArch64::STURBi:
3485 Scale = 1;
3486 return AArch64::STRBui;
3487 case AArch64::STURBBi:
3488 Scale = 1;
3489 return AArch64::STRBBui;
3490 case AArch64::LDRQui:
3491 case AArch64::STRQui:
3492 Scale = 16;
3493 return Opcode;
3494 case AArch64::LDRDui:
3495 case AArch64::STRDui:
3496 case AArch64::LDRXui:
3497 case AArch64::STRXui:
3498 Scale = 8;
3499 return Opcode;
3500 case AArch64::LDRWui:
3501 case AArch64::LDRSWui:
3502 case AArch64::STRWui:
3503 Scale = 4;
3504 return Opcode;
3505 case AArch64::LDRHui:
3506 case AArch64::STRHui:
3507 case AArch64::LDRHHui:
3508 case AArch64::STRHHui:
3509 case AArch64::LDRSHXui:
3510 case AArch64::LDRSHWui:
3511 Scale = 2;
3512 return Opcode;
3513 case AArch64::LDRBui:
3514 case AArch64::LDRBBui:
3515 case AArch64::LDRSBXui:
3516 case AArch64::LDRSBWui:
3517 case AArch64::STRBui:
3518 case AArch64::STRBBui:
3519 Scale = 1;
3520 return Opcode;
3521 }
3522 }
3523
3524 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3525 // the opcode of an instruction performing the same operation, but using the
3526 // [Reg, #Imm] addressing mode with unscaled offset.
3527 static unsigned unscaledOffsetOpcode(unsigned Opcode) {
3528 switch (Opcode) {
3529 default:
3530 llvm_unreachable("Address folding not implemented for instruction");
3531
3532 case AArch64::LDURQi:
3533 case AArch64::STURQi:
3534 case AArch64::LDURDi:
3535 case AArch64::STURDi:
3536 case AArch64::LDURXi:
3537 case AArch64::STURXi:
3538 case AArch64::LDURWi:
3539 case AArch64::LDURSWi:
3540 case AArch64::STURWi:
3541 case AArch64::LDURHi:
3542 case AArch64::STURHi:
3543 case AArch64::LDURHHi:
3544 case AArch64::STURHHi:
3545 case AArch64::LDURSHXi:
3546 case AArch64::LDURSHWi:
3547 case AArch64::LDURBi:
3548 case AArch64::STURBi:
3549 case AArch64::LDURBBi:
3550 case AArch64::STURBBi:
3551 case AArch64::LDURSBWi:
3552 case AArch64::LDURSBXi:
3553 return Opcode;
3554 case AArch64::LDRQui:
3555 return AArch64::LDURQi;
3556 case AArch64::STRQui:
3557 return AArch64::STURQi;
3558 case AArch64::LDRDui:
3559 return AArch64::LDURDi;
3560 case AArch64::STRDui:
3561 return AArch64::STURDi;
3562 case AArch64::LDRXui:
3563 return AArch64::LDURXi;
3564 case AArch64::STRXui:
3565 return AArch64::STURXi;
3566 case AArch64::LDRWui:
3567 return AArch64::LDURWi;
3568 case AArch64::LDRSWui:
3569 return AArch64::LDURSWi;
3570 case AArch64::STRWui:
3571 return AArch64::STURWi;
3572 case AArch64::LDRHui:
3573 return AArch64::LDURHi;
3574 case AArch64::STRHui:
3575 return AArch64::STURHi;
3576 case AArch64::LDRHHui:
3577 return AArch64::LDURHHi;
3578 case AArch64::STRHHui:
3579 return AArch64::STURHHi;
3580 case AArch64::LDRSHXui:
3581 return AArch64::LDURSHXi;
3582 case AArch64::LDRSHWui:
3583 return AArch64::LDURSHWi;
3584 case AArch64::LDRBBui:
3585 return AArch64::LDURBBi;
3586 case AArch64::LDRBui:
3587 return AArch64::LDURBi;
3588 case AArch64::STRBBui:
3589 return AArch64::STURBBi;
3590 case AArch64::STRBui:
3591 return AArch64::STURBi;
3592 case AArch64::LDRSBWui:
3593 return AArch64::LDURSBWi;
3594 case AArch64::LDRSBXui:
3595 return AArch64::LDURSBXi;
3596 }
3597 }
3598
3599 // Given the opcode of a memory load/store instruction, return the opcode of an
3600 // instruction performing the same operation, but using
3601 // the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
3602 // offset register.
3603 static unsigned offsetExtendOpcode(unsigned Opcode) {
3604 switch (Opcode) {
3605 default:
3606 llvm_unreachable("Address folding not implemented for instruction");
3607
3608 case AArch64::LDRQroX:
3609 case AArch64::LDURQi:
3610 case AArch64::LDRQui:
3611 return AArch64::LDRQroW;
3612 case AArch64::STRQroX:
3613 case AArch64::STURQi:
3614 case AArch64::STRQui:
3615 return AArch64::STRQroW;
3616 case AArch64::LDRDroX:
3617 case AArch64::LDURDi:
3618 case AArch64::LDRDui:
3619 return AArch64::LDRDroW;
3620 case AArch64::STRDroX:
3621 case AArch64::STURDi:
3622 case AArch64::STRDui:
3623 return AArch64::STRDroW;
3624 case AArch64::LDRXroX:
3625 case AArch64::LDURXi:
3626 case AArch64::LDRXui:
3627 return AArch64::LDRXroW;
3628 case AArch64::STRXroX:
3629 case AArch64::STURXi:
3630 case AArch64::STRXui:
3631 return AArch64::STRXroW;
3632 case AArch64::LDRWroX:
3633 case AArch64::LDURWi:
3634 case AArch64::LDRWui:
3635 return AArch64::LDRWroW;
3636 case AArch64::LDRSWroX:
3637 case AArch64::LDURSWi:
3638 case AArch64::LDRSWui:
3639 return AArch64::LDRSWroW;
3640 case AArch64::STRWroX:
3641 case AArch64::STURWi:
3642 case AArch64::STRWui:
3643 return AArch64::STRWroW;
3644 case AArch64::LDRHroX:
3645 case AArch64::LDURHi:
3646 case AArch64::LDRHui:
3647 return AArch64::LDRHroW;
3648 case AArch64::STRHroX:
3649 case AArch64::STURHi:
3650 case AArch64::STRHui:
3651 return AArch64::STRHroW;
3652 case AArch64::LDRHHroX:
3653 case AArch64::LDURHHi:
3654 case AArch64::LDRHHui:
3655 return AArch64::LDRHHroW;
3656 case AArch64::STRHHroX:
3657 case AArch64::STURHHi:
3658 case AArch64::STRHHui:
3659 return AArch64::STRHHroW;
3660 case AArch64::LDRSHXroX:
3661 case AArch64::LDURSHXi:
3662 case AArch64::LDRSHXui:
3663 return AArch64::LDRSHXroW;
3664 case AArch64::LDRSHWroX:
3665 case AArch64::LDURSHWi:
3666 case AArch64::LDRSHWui:
3667 return AArch64::LDRSHWroW;
3668 case AArch64::LDRBroX:
3669 case AArch64::LDURBi:
3670 case AArch64::LDRBui:
3671 return AArch64::LDRBroW;
3672 case AArch64::LDRBBroX:
3673 case AArch64::LDURBBi:
3674 case AArch64::LDRBBui:
3675 return AArch64::LDRBBroW;
3676 case AArch64::LDRSBXroX:
3677 case AArch64::LDURSBXi:
3678 case AArch64::LDRSBXui:
3679 return AArch64::LDRSBXroW;
3680 case AArch64::LDRSBWroX:
3681 case AArch64::LDURSBWi:
3682 case AArch64::LDRSBWui:
3683 return AArch64::LDRSBWroW;
3684 case AArch64::STRBroX:
3685 case AArch64::STURBi:
3686 case AArch64::STRBui:
3687 return AArch64::STRBroW;
3688 case AArch64::STRBBroX:
3689 case AArch64::STURBBi:
3690 case AArch64::STRBBui:
3691 return AArch64::STRBBroW;
3692 }
3693 }
3694
3695 MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI,
3696 const ExtAddrMode &AM) const {
3697
3698 const DebugLoc &DL = MemI.getDebugLoc();
3699 MachineBasicBlock &MBB = *MemI.getParent();
3700 MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();
3701
3702 if (AM.Form == ExtAddrMode::Formula::Basic) {
3703 if (AM.ScaledReg) {
3704 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
3705 unsigned Opcode = regOffsetOpcode(MemI.getOpcode());
3706 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
3707 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3708 .addReg(MemI.getOperand(0).getReg(),
3709 MemI.mayLoad() ? RegState::Define : 0)
3710 .addReg(AM.BaseReg)
3711 .addReg(AM.ScaledReg)
3712 .addImm(0)
3713 .addImm(AM.Scale > 1)
3714 .setMemRefs(MemI.memoperands())
3715 .setMIFlags(MemI.getFlags());
3716 return B.getInstr();
3717 }
3718
3719 assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
3720 "Addressing mode not supported for folding");
3721
3722 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
3723 unsigned Scale = 1;
3724 unsigned Opcode = MemI.getOpcode();
3725 if (isInt<9>(AM.Displacement))
3726 Opcode = unscaledOffsetOpcode(Opcode);
3727 else
3728 Opcode = scaledOffsetOpcode(Opcode, Scale);
3729
3730 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3731 .addReg(MemI.getOperand(0).getReg(),
3732 MemI.mayLoad() ? RegState::Define : 0)
3733 .addReg(AM.BaseReg)
3734 .addImm(AM.Displacement / Scale)
3735 .setMemRefs(MemI.memoperands())
3736 .setMIFlags(MemI.getFlags());
3737 return B.getInstr();
3738 }
3739
3740 if (AM.Form == ExtAddrMode::Formula::SExtScaledReg ||
3741 AM.Form == ExtAddrMode::Formula::ZExtScaledReg) {
3742 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
3743 assert(AM.ScaledReg && !AM.Displacement &&
3744 "Address offset can be a register or an immediate, but not both");
3745 unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());
3746 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
3747 // Make sure the offset register is in the correct register class.
3748 Register OffsetReg = AM.ScaledReg;
3749 const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg);
3750 if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {
3751 OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3752 BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)
3753 .addReg(AM.ScaledReg, 0, AArch64::sub_32);
3754 }
3755 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3756 .addReg(MemI.getOperand(0).getReg(),
3757 MemI.mayLoad() ? RegState::Define : 0)
3758 .addReg(AM.BaseReg)
3759 .addReg(OffsetReg)
3760 .addImm(AM.Form == ExtAddrMode::Formula::SExtScaledReg)
3761 .addImm(AM.Scale != 1)
3762 .setMemRefs(MemI.memoperands())
3763 .setMIFlags(MemI.getFlags());
3764
3765 return B.getInstr();
3766 }
3767
3768 llvm_unreachable(
3769 "Function must not be called with an addressing mode it can't handle");
3770 }
3771
3772 /// Return true if the opcode is a post-index ld/st instruction, which really
3773 /// loads from or stores to base+0.
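/// For example, "ldr x0, [x1], #8" reads memory at x1+0 and only afterwards
/// increments x1 by 8.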
3774 static bool isPostIndexLdStOpcode(unsigned Opcode) {
3775 switch (Opcode) {
3776 default:
3777 return false;
3778 case AArch64::LD1Fourv16b_POST:
3779 case AArch64::LD1Fourv1d_POST:
3780 case AArch64::LD1Fourv2d_POST:
3781 case AArch64::LD1Fourv2s_POST:
3782 case AArch64::LD1Fourv4h_POST:
3783 case AArch64::LD1Fourv4s_POST:
3784 case AArch64::LD1Fourv8b_POST:
3785 case AArch64::LD1Fourv8h_POST:
3786 case AArch64::LD1Onev16b_POST:
3787 case AArch64::LD1Onev1d_POST:
3788 case AArch64::LD1Onev2d_POST:
3789 case AArch64::LD1Onev2s_POST:
3790 case AArch64::LD1Onev4h_POST:
3791 case AArch64::LD1Onev4s_POST:
3792 case AArch64::LD1Onev8b_POST:
3793 case AArch64::LD1Onev8h_POST:
3794 case AArch64::LD1Rv16b_POST:
3795 case AArch64::LD1Rv1d_POST:
3796 case AArch64::LD1Rv2d_POST:
3797 case AArch64::LD1Rv2s_POST:
3798 case AArch64::LD1Rv4h_POST:
3799 case AArch64::LD1Rv4s_POST:
3800 case AArch64::LD1Rv8b_POST:
3801 case AArch64::LD1Rv8h_POST:
3802 case AArch64::LD1Threev16b_POST:
3803 case AArch64::LD1Threev1d_POST:
3804 case AArch64::LD1Threev2d_POST:
3805 case AArch64::LD1Threev2s_POST:
3806 case AArch64::LD1Threev4h_POST:
3807 case AArch64::LD1Threev4s_POST:
3808 case AArch64::LD1Threev8b_POST:
3809 case AArch64::LD1Threev8h_POST:
3810 case AArch64::LD1Twov16b_POST:
3811 case AArch64::LD1Twov1d_POST:
3812 case AArch64::LD1Twov2d_POST:
3813 case AArch64::LD1Twov2s_POST:
3814 case AArch64::LD1Twov4h_POST:
3815 case AArch64::LD1Twov4s_POST:
3816 case AArch64::LD1Twov8b_POST:
3817 case AArch64::LD1Twov8h_POST:
3818 case AArch64::LD1i16_POST:
3819 case AArch64::LD1i32_POST:
3820 case AArch64::LD1i64_POST:
3821 case AArch64::LD1i8_POST:
3822 case AArch64::LD2Rv16b_POST:
3823 case AArch64::LD2Rv1d_POST:
3824 case AArch64::LD2Rv2d_POST:
3825 case AArch64::LD2Rv2s_POST:
3826 case AArch64::LD2Rv4h_POST:
3827 case AArch64::LD2Rv4s_POST:
3828 case AArch64::LD2Rv8b_POST:
3829 case AArch64::LD2Rv8h_POST:
3830 case AArch64::LD2Twov16b_POST:
3831 case AArch64::LD2Twov2d_POST:
3832 case AArch64::LD2Twov2s_POST:
3833 case AArch64::LD2Twov4h_POST:
3834 case AArch64::LD2Twov4s_POST:
3835 case AArch64::LD2Twov8b_POST:
3836 case AArch64::LD2Twov8h_POST:
3837 case AArch64::LD2i16_POST:
3838 case AArch64::LD2i32_POST:
3839 case AArch64::LD2i64_POST:
3840 case AArch64::LD2i8_POST:
3841 case AArch64::LD3Rv16b_POST:
3842 case AArch64::LD3Rv1d_POST:
3843 case AArch64::LD3Rv2d_POST:
3844 case AArch64::LD3Rv2s_POST:
3845 case AArch64::LD3Rv4h_POST:
3846 case AArch64::LD3Rv4s_POST:
3847 case AArch64::LD3Rv8b_POST:
3848 case AArch64::LD3Rv8h_POST:
3849 case AArch64::LD3Threev16b_POST:
3850 case AArch64::LD3Threev2d_POST:
3851 case AArch64::LD3Threev2s_POST:
3852 case AArch64::LD3Threev4h_POST:
3853 case AArch64::LD3Threev4s_POST:
3854 case AArch64::LD3Threev8b_POST:
3855 case AArch64::LD3Threev8h_POST:
3856 case AArch64::LD3i16_POST:
3857 case AArch64::LD3i32_POST:
3858 case AArch64::LD3i64_POST:
3859 case AArch64::LD3i8_POST:
3860 case AArch64::LD4Fourv16b_POST:
3861 case AArch64::LD4Fourv2d_POST:
3862 case AArch64::LD4Fourv2s_POST:
3863 case AArch64::LD4Fourv4h_POST:
3864 case AArch64::LD4Fourv4s_POST:
3865 case AArch64::LD4Fourv8b_POST:
3866 case AArch64::LD4Fourv8h_POST:
3867 case AArch64::LD4Rv16b_POST:
3868 case AArch64::LD4Rv1d_POST:
3869 case AArch64::LD4Rv2d_POST:
3870 case AArch64::LD4Rv2s_POST:
3871 case AArch64::LD4Rv4h_POST:
3872 case AArch64::LD4Rv4s_POST:
3873 case AArch64::LD4Rv8b_POST:
3874 case AArch64::LD4Rv8h_POST:
3875 case AArch64::LD4i16_POST:
3876 case AArch64::LD4i32_POST:
3877 case AArch64::LD4i64_POST:
3878 case AArch64::LD4i8_POST:
3879 case AArch64::LDAPRWpost:
3880 case AArch64::LDAPRXpost:
3881 case AArch64::LDIAPPWpost:
3882 case AArch64::LDIAPPXpost:
3883 case AArch64::LDPDpost:
3884 case AArch64::LDPQpost:
3885 case AArch64::LDPSWpost:
3886 case AArch64::LDPSpost:
3887 case AArch64::LDPWpost:
3888 case AArch64::LDPXpost:
3889 case AArch64::LDRBBpost:
3890 case AArch64::LDRBpost:
3891 case AArch64::LDRDpost:
3892 case AArch64::LDRHHpost:
3893 case AArch64::LDRHpost:
3894 case AArch64::LDRQpost:
3895 case AArch64::LDRSBWpost:
3896 case AArch64::LDRSBXpost:
3897 case AArch64::LDRSHWpost:
3898 case AArch64::LDRSHXpost:
3899 case AArch64::LDRSWpost:
3900 case AArch64::LDRSpost:
3901 case AArch64::LDRWpost:
3902 case AArch64::LDRXpost:
3903 case AArch64::ST1Fourv16b_POST:
3904 case AArch64::ST1Fourv1d_POST:
3905 case AArch64::ST1Fourv2d_POST:
3906 case AArch64::ST1Fourv2s_POST:
3907 case AArch64::ST1Fourv4h_POST:
3908 case AArch64::ST1Fourv4s_POST:
3909 case AArch64::ST1Fourv8b_POST:
3910 case AArch64::ST1Fourv8h_POST:
3911 case AArch64::ST1Onev16b_POST:
3912 case AArch64::ST1Onev1d_POST:
3913 case AArch64::ST1Onev2d_POST:
3914 case AArch64::ST1Onev2s_POST:
3915 case AArch64::ST1Onev4h_POST:
3916 case AArch64::ST1Onev4s_POST:
3917 case AArch64::ST1Onev8b_POST:
3918 case AArch64::ST1Onev8h_POST:
3919 case AArch64::ST1Threev16b_POST:
3920 case AArch64::ST1Threev1d_POST:
3921 case AArch64::ST1Threev2d_POST:
3922 case AArch64::ST1Threev2s_POST:
3923 case AArch64::ST1Threev4h_POST:
3924 case AArch64::ST1Threev4s_POST:
3925 case AArch64::ST1Threev8b_POST:
3926 case AArch64::ST1Threev8h_POST:
3927 case AArch64::ST1Twov16b_POST:
3928 case AArch64::ST1Twov1d_POST:
3929 case AArch64::ST1Twov2d_POST:
3930 case AArch64::ST1Twov2s_POST:
3931 case AArch64::ST1Twov4h_POST:
3932 case AArch64::ST1Twov4s_POST:
3933 case AArch64::ST1Twov8b_POST:
3934 case AArch64::ST1Twov8h_POST:
3935 case AArch64::ST1i16_POST:
3936 case AArch64::ST1i32_POST:
3937 case AArch64::ST1i64_POST:
3938 case AArch64::ST1i8_POST:
3939 case AArch64::ST2GPostIndex:
3940 case AArch64::ST2Twov16b_POST:
3941 case AArch64::ST2Twov2d_POST:
3942 case AArch64::ST2Twov2s_POST:
3943 case AArch64::ST2Twov4h_POST:
3944 case AArch64::ST2Twov4s_POST:
3945 case AArch64::ST2Twov8b_POST:
3946 case AArch64::ST2Twov8h_POST:
3947 case AArch64::ST2i16_POST:
3948 case AArch64::ST2i32_POST:
3949 case AArch64::ST2i64_POST:
3950 case AArch64::ST2i8_POST:
3951 case AArch64::ST3Threev16b_POST:
3952 case AArch64::ST3Threev2d_POST:
3953 case AArch64::ST3Threev2s_POST:
3954 case AArch64::ST3Threev4h_POST:
3955 case AArch64::ST3Threev4s_POST:
3956 case AArch64::ST3Threev8b_POST:
3957 case AArch64::ST3Threev8h_POST:
3958 case AArch64::ST3i16_POST:
3959 case AArch64::ST3i32_POST:
3960 case AArch64::ST3i64_POST:
3961 case AArch64::ST3i8_POST:
3962 case AArch64::ST4Fourv16b_POST:
3963 case AArch64::ST4Fourv2d_POST:
3964 case AArch64::ST4Fourv2s_POST:
3965 case AArch64::ST4Fourv4h_POST:
3966 case AArch64::ST4Fourv4s_POST:
3967 case AArch64::ST4Fourv8b_POST:
3968 case AArch64::ST4Fourv8h_POST:
3969 case AArch64::ST4i16_POST:
3970 case AArch64::ST4i32_POST:
3971 case AArch64::ST4i64_POST:
3972 case AArch64::ST4i8_POST:
3973 case AArch64::STGPostIndex:
3974 case AArch64::STGPpost:
3975 case AArch64::STPDpost:
3976 case AArch64::STPQpost:
3977 case AArch64::STPSpost:
3978 case AArch64::STPWpost:
3979 case AArch64::STPXpost:
3980 case AArch64::STRBBpost:
3981 case AArch64::STRBpost:
3982 case AArch64::STRDpost:
3983 case AArch64::STRHHpost:
3984 case AArch64::STRHpost:
3985 case AArch64::STRQpost:
3986 case AArch64::STRSpost:
3987 case AArch64::STRWpost:
3988 case AArch64::STRXpost:
3989 case AArch64::STZ2GPostIndex:
3990 case AArch64::STZGPostIndex:
3991 return true;
3992 }
3993 }
3994
3995 bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
3996 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
3997 bool &OffsetIsScalable, TypeSize &Width,
3998 const TargetRegisterInfo *TRI) const {
3999 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4000 // Handle only loads/stores with base register followed by immediate offset.
4001 if (LdSt.getNumExplicitOperands() == 3) {
4002 // Non-paired instruction (e.g., ldr x1, [x0, #8]).
4003 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
4004 !LdSt.getOperand(2).isImm())
4005 return false;
4006 } else if (LdSt.getNumExplicitOperands() == 4) {
4007 // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
4008 if (!LdSt.getOperand(1).isReg() ||
4009 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
4010 !LdSt.getOperand(3).isImm())
4011 return false;
4012 } else
4013 return false;
4014
4015 // Get the scaling factor for the instruction and set the width for the
4016 // instruction.
4017 TypeSize Scale(0U, false);
4018 int64_t Dummy1, Dummy2;
4019
4020 // If this returns false, then it's an instruction we don't want to handle.
4021 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
4022 return false;
4023
4024 // Compute the offset. The offset is the immediate operand multiplied by the
4025 // scaling factor; unscaled instructions have a scaling factor of 1.
4026 // Post-indexed instructions are a special case and have an offset of 0.
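// For example, "ldr x1, [x0, #8]" (LDRXui) encodes imm 1 with scale 8, while
// the unscaled "ldur x1, [x0, #8]" (LDURXi) encodes imm 8 with scale 1; both
// yield Offset = 8.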
4027 if (isPostIndexLdStOpcode(LdSt.getOpcode())) {
4028 BaseOp = &LdSt.getOperand(2);
4029 Offset = 0;
4030 } else if (LdSt.getNumExplicitOperands() == 3) {
4031 BaseOp = &LdSt.getOperand(1);
4032 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue();
4033 } else {
4034 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
4035 BaseOp = &LdSt.getOperand(2);
4036 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue();
4037 }
4038 OffsetIsScalable = Scale.isScalable();
4039
4040 return BaseOp->isReg() || BaseOp->isFI();
4041 }
4042
4043 MachineOperand &
4044 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
4045 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
4046 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
4047 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
4048 return OfsOp;
4049 }
4050
4051 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
4052 TypeSize &Width, int64_t &MinOffset,
4053 int64_t &MaxOffset) {
4054 switch (Opcode) {
4055 // Not a memory operation or something we want to handle.
4056 default:
4057 Scale = TypeSize::getFixed(0);
4058 Width = TypeSize::getFixed(0);
4059 MinOffset = MaxOffset = 0;
4060 return false;
4061 // LDR / STR
4062 case AArch64::LDRQui:
4063 case AArch64::STRQui:
4064 Scale = TypeSize::getFixed(16);
4065 Width = TypeSize::getFixed(16);
4066 MinOffset = 0;
4067 MaxOffset = 4095;
4068 break;
4069 case AArch64::LDRXui:
4070 case AArch64::LDRDui:
4071 case AArch64::STRXui:
4072 case AArch64::STRDui:
4073 case AArch64::PRFMui:
4074 Scale = TypeSize::getFixed(8);
4075 Width = TypeSize::getFixed(8);
4076 MinOffset = 0;
4077 MaxOffset = 4095;
4078 break;
4079 case AArch64::LDRWui:
4080 case AArch64::LDRSui:
4081 case AArch64::LDRSWui:
4082 case AArch64::STRWui:
4083 case AArch64::STRSui:
4084 Scale = TypeSize::getFixed(4);
4085 Width = TypeSize::getFixed(4);
4086 MinOffset = 0;
4087 MaxOffset = 4095;
4088 break;
4089 case AArch64::LDRHui:
4090 case AArch64::LDRHHui:
4091 case AArch64::LDRSHWui:
4092 case AArch64::LDRSHXui:
4093 case AArch64::STRHui:
4094 case AArch64::STRHHui:
4095 Scale = TypeSize::getFixed(2);
4096 Width = TypeSize::getFixed(2);
4097 MinOffset = 0;
4098 MaxOffset = 4095;
4099 break;
4100 case AArch64::LDRBui:
4101 case AArch64::LDRBBui:
4102 case AArch64::LDRSBWui:
4103 case AArch64::LDRSBXui:
4104 case AArch64::STRBui:
4105 case AArch64::STRBBui:
4106 Scale = TypeSize::getFixed(1);
4107 Width = TypeSize::getFixed(1);
4108 MinOffset = 0;
4109 MaxOffset = 4095;
4110 break;
4111 // post/pre inc
4112 case AArch64::STRQpre:
4113 case AArch64::LDRQpost:
4114 Scale = TypeSize::getFixed(1);
4115 Width = TypeSize::getFixed(16);
4116 MinOffset = -256;
4117 MaxOffset = 255;
4118 break;
4119 case AArch64::LDRDpost:
4120 case AArch64::LDRDpre:
4121 case AArch64::LDRXpost:
4122 case AArch64::LDRXpre:
4123 case AArch64::STRDpost:
4124 case AArch64::STRDpre:
4125 case AArch64::STRXpost:
4126 case AArch64::STRXpre:
4127 Scale = TypeSize::getFixed(1);
4128 Width = TypeSize::getFixed(8);
4129 MinOffset = -256;
4130 MaxOffset = 255;
4131 break;
4132 case AArch64::STRWpost:
4133 case AArch64::STRWpre:
4134 case AArch64::LDRWpost:
4135 case AArch64::LDRWpre:
4136 case AArch64::STRSpost:
4137 case AArch64::STRSpre:
4138 case AArch64::LDRSpost:
4139 case AArch64::LDRSpre:
4140 Scale = TypeSize::getFixed(1);
4141 Width = TypeSize::getFixed(4);
4142 MinOffset = -256;
4143 MaxOffset = 255;
4144 break;
4145 case AArch64::LDRHpost:
4146 case AArch64::LDRHpre:
4147 case AArch64::STRHpost:
4148 case AArch64::STRHpre:
4149 case AArch64::LDRHHpost:
4150 case AArch64::LDRHHpre:
4151 case AArch64::STRHHpost:
4152 case AArch64::STRHHpre:
4153 Scale = TypeSize::getFixed(1);
4154 Width = TypeSize::getFixed(2);
4155 MinOffset = -256;
4156 MaxOffset = 255;
4157 break;
4158 case AArch64::LDRBpost:
4159 case AArch64::LDRBpre:
4160 case AArch64::STRBpost:
4161 case AArch64::STRBpre:
4162 case AArch64::LDRBBpost:
4163 case AArch64::LDRBBpre:
4164 case AArch64::STRBBpost:
4165 case AArch64::STRBBpre:
4166 Scale = TypeSize::getFixed(1);
4167 Width = TypeSize::getFixed(1);
4168 MinOffset = -256;
4169 MaxOffset = 255;
4170 break;
4171 // Unscaled
4172 case AArch64::LDURQi:
4173 case AArch64::STURQi:
4174 Scale = TypeSize::getFixed(1);
4175 Width = TypeSize::getFixed(16);
4176 MinOffset = -256;
4177 MaxOffset = 255;
4178 break;
4179 case AArch64::LDURXi:
4180 case AArch64::LDURDi:
4181 case AArch64::LDAPURXi:
4182 case AArch64::STURXi:
4183 case AArch64::STURDi:
4184 case AArch64::STLURXi:
4185 case AArch64::PRFUMi:
4186 Scale = TypeSize::getFixed(1);
4187 Width = TypeSize::getFixed(8);
4188 MinOffset = -256;
4189 MaxOffset = 255;
4190 break;
4191 case AArch64::LDURWi:
4192 case AArch64::LDURSi:
4193 case AArch64::LDURSWi:
4194 case AArch64::LDAPURi:
4195 case AArch64::LDAPURSWi:
4196 case AArch64::STURWi:
4197 case AArch64::STURSi:
4198 case AArch64::STLURWi:
4199 Scale = TypeSize::getFixed(1);
4200 Width = TypeSize::getFixed(4);
4201 MinOffset = -256;
4202 MaxOffset = 255;
4203 break;
4204 case AArch64::LDURHi:
4205 case AArch64::LDURHHi:
4206 case AArch64::LDURSHXi:
4207 case AArch64::LDURSHWi:
4208 case AArch64::LDAPURHi:
4209 case AArch64::LDAPURSHWi:
4210 case AArch64::LDAPURSHXi:
4211 case AArch64::STURHi:
4212 case AArch64::STURHHi:
4213 case AArch64::STLURHi:
4214 Scale = TypeSize::getFixed(1);
4215 Width = TypeSize::getFixed(2);
4216 MinOffset = -256;
4217 MaxOffset = 255;
4218 break;
4219 case AArch64::LDURBi:
4220 case AArch64::LDURBBi:
4221 case AArch64::LDURSBXi:
4222 case AArch64::LDURSBWi:
4223 case AArch64::LDAPURBi:
4224 case AArch64::LDAPURSBWi:
4225 case AArch64::LDAPURSBXi:
4226 case AArch64::STURBi:
4227 case AArch64::STURBBi:
4228 case AArch64::STLURBi:
4229 Scale = TypeSize::getFixed(1);
4230 Width = TypeSize::getFixed(1);
4231 MinOffset = -256;
4232 MaxOffset = 255;
4233 break;
4234 // LDP / STP (including pre/post inc)
4235 case AArch64::LDPQi:
4236 case AArch64::LDNPQi:
4237 case AArch64::STPQi:
4238 case AArch64::STNPQi:
4239 case AArch64::LDPQpost:
4240 case AArch64::LDPQpre:
4241 case AArch64::STPQpost:
4242 case AArch64::STPQpre:
4243 Scale = TypeSize::getFixed(16);
4244 Width = TypeSize::getFixed(16 * 2);
4245 MinOffset = -64;
4246 MaxOffset = 63;
4247 break;
4248 case AArch64::LDPXi:
4249 case AArch64::LDPDi:
4250 case AArch64::LDNPXi:
4251 case AArch64::LDNPDi:
4252 case AArch64::STPXi:
4253 case AArch64::STPDi:
4254 case AArch64::STNPXi:
4255 case AArch64::STNPDi:
4256 case AArch64::LDPDpost:
4257 case AArch64::LDPDpre:
4258 case AArch64::LDPXpost:
4259 case AArch64::LDPXpre:
4260 case AArch64::STPDpost:
4261 case AArch64::STPDpre:
4262 case AArch64::STPXpost:
4263 case AArch64::STPXpre:
4264 Scale = TypeSize::getFixed(8);
4265 Width = TypeSize::getFixed(8 * 2);
4266 MinOffset = -64;
4267 MaxOffset = 63;
4268 break;
4269 case AArch64::LDPWi:
4270 case AArch64::LDPSi:
4271 case AArch64::LDNPWi:
4272 case AArch64::LDNPSi:
4273 case AArch64::STPWi:
4274 case AArch64::STPSi:
4275 case AArch64::STNPWi:
4276 case AArch64::STNPSi:
4277 case AArch64::LDPSpost:
4278 case AArch64::LDPSpre:
4279 case AArch64::LDPWpost:
4280 case AArch64::LDPWpre:
4281 case AArch64::STPSpost:
4282 case AArch64::STPSpre:
4283 case AArch64::STPWpost:
4284 case AArch64::STPWpre:
4285 Scale = TypeSize::getFixed(4);
4286 Width = TypeSize::getFixed(4 * 2);
4287 MinOffset = -64;
4288 MaxOffset = 63;
4289 break;
4290 case AArch64::StoreSwiftAsyncContext:
4291 // Store is an STRXui, but there might be an ADDXri in the expansion too.
4292 Scale = TypeSize::getFixed(1);
4293 Width = TypeSize::getFixed(8);
4294 MinOffset = 0;
4295 MaxOffset = 4095;
4296 break;
4297 case AArch64::ADDG:
4298 Scale = TypeSize::getFixed(16);
4299 Width = TypeSize::getFixed(0);
4300 MinOffset = 0;
4301 MaxOffset = 63;
4302 break;
4303 case AArch64::TAGPstack:
4304 Scale = TypeSize::getFixed(16);
4305 Width = TypeSize::getFixed(0);
4306 // TAGP with a negative offset turns into SUBP, which has a maximum offset
4307 // of 63 (not 64!).
4308 MinOffset = -63;
4309 MaxOffset = 63;
4310 break;
4311 case AArch64::LDG:
4312 case AArch64::STGi:
4313 case AArch64::STGPreIndex:
4314 case AArch64::STGPostIndex:
4315 case AArch64::STZGi:
4316 case AArch64::STZGPreIndex:
4317 case AArch64::STZGPostIndex:
4318 Scale = TypeSize::getFixed(16);
4319 Width = TypeSize::getFixed(16);
4320 MinOffset = -256;
4321 MaxOffset = 255;
4322 break;
4323 // SVE
4324 case AArch64::STR_ZZZZXI:
4325 case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
4326 case AArch64::LDR_ZZZZXI:
4327 case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
4328 Scale = TypeSize::getScalable(16);
4329 Width = TypeSize::getScalable(16 * 4);
4330 MinOffset = -256;
4331 MaxOffset = 252;
4332 break;
4333 case AArch64::STR_ZZZXI:
4334 case AArch64::LDR_ZZZXI:
4335 Scale = TypeSize::getScalable(16);
4336 Width = TypeSize::getScalable(16 * 3);
4337 MinOffset = -256;
4338 MaxOffset = 253;
4339 break;
4340 case AArch64::STR_ZZXI:
4341 case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
4342 case AArch64::LDR_ZZXI:
4343 case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
4344 Scale = TypeSize::getScalable(16);
4345 Width = TypeSize::getScalable(16 * 2);
4346 MinOffset = -256;
4347 MaxOffset = 254;
4348 break;
4349 case AArch64::LDR_PXI:
4350 case AArch64::STR_PXI:
4351 Scale = TypeSize::getScalable(2);
4352 Width = TypeSize::getScalable(2);
4353 MinOffset = -256;
4354 MaxOffset = 255;
4355 break;
4356 case AArch64::LDR_PPXI:
4357 case AArch64::STR_PPXI:
4358 Scale = TypeSize::getScalable(2);
4359 Width = TypeSize::getScalable(2 * 2);
4360 MinOffset = -256;
4361 MaxOffset = 254;
4362 break;
4363 case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO:
4364 case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO:
4365 case AArch64::LDR_ZXI:
4366 case AArch64::STR_ZXI:
4367 Scale = TypeSize::getScalable(16);
4368 Width = TypeSize::getScalable(16);
4369 MinOffset = -256;
4370 MaxOffset = 255;
4371 break;
4372 case AArch64::LD1B_IMM:
4373 case AArch64::LD1H_IMM:
4374 case AArch64::LD1W_IMM:
4375 case AArch64::LD1D_IMM:
4376 case AArch64::LDNT1B_ZRI:
4377 case AArch64::LDNT1H_ZRI:
4378 case AArch64::LDNT1W_ZRI:
4379 case AArch64::LDNT1D_ZRI:
4380 case AArch64::ST1B_IMM:
4381 case AArch64::ST1H_IMM:
4382 case AArch64::ST1W_IMM:
4383 case AArch64::ST1D_IMM:
4384 case AArch64::STNT1B_ZRI:
4385 case AArch64::STNT1H_ZRI:
4386 case AArch64::STNT1W_ZRI:
4387 case AArch64::STNT1D_ZRI:
4388 case AArch64::LDNF1B_IMM:
4389 case AArch64::LDNF1H_IMM:
4390 case AArch64::LDNF1W_IMM:
4391 case AArch64::LDNF1D_IMM:
4392 // A full vector's worth of data
4393 // Width = mbytes * elements
4394 Scale = TypeSize::getScalable(16);
4395 Width = TypeSize::getScalable(16);
4396 MinOffset = -8;
4397 MaxOffset = 7;
4398 break;
4399 case AArch64::LD2B_IMM:
4400 case AArch64::LD2H_IMM:
4401 case AArch64::LD2W_IMM:
4402 case AArch64::LD2D_IMM:
4403 case AArch64::ST2B_IMM:
4404 case AArch64::ST2H_IMM:
4405 case AArch64::ST2W_IMM:
4406 case AArch64::ST2D_IMM:
4407 Scale = TypeSize::getScalable(32);
4408 Width = TypeSize::getScalable(16 * 2);
4409 MinOffset = -8;
4410 MaxOffset = 7;
4411 break;
4412 case AArch64::LD3B_IMM:
4413 case AArch64::LD3H_IMM:
4414 case AArch64::LD3W_IMM:
4415 case AArch64::LD3D_IMM:
4416 case AArch64::ST3B_IMM:
4417 case AArch64::ST3H_IMM:
4418 case AArch64::ST3W_IMM:
4419 case AArch64::ST3D_IMM:
4420 Scale = TypeSize::getScalable(48);
4421 Width = TypeSize::getScalable(16 * 3);
4422 MinOffset = -8;
4423 MaxOffset = 7;
4424 break;
4425 case AArch64::LD4B_IMM:
4426 case AArch64::LD4H_IMM:
4427 case AArch64::LD4W_IMM:
4428 case AArch64::LD4D_IMM:
4429 case AArch64::ST4B_IMM:
4430 case AArch64::ST4H_IMM:
4431 case AArch64::ST4W_IMM:
4432 case AArch64::ST4D_IMM:
4433 Scale = TypeSize::getScalable(64);
4434 Width = TypeSize::getScalable(16 * 4);
4435 MinOffset = -8;
4436 MaxOffset = 7;
4437 break;
4438 case AArch64::LD1B_H_IMM:
4439 case AArch64::LD1SB_H_IMM:
4440 case AArch64::LD1H_S_IMM:
4441 case AArch64::LD1SH_S_IMM:
4442 case AArch64::LD1W_D_IMM:
4443 case AArch64::LD1SW_D_IMM:
4444 case AArch64::ST1B_H_IMM:
4445 case AArch64::ST1H_S_IMM:
4446 case AArch64::ST1W_D_IMM:
4447 case AArch64::LDNF1B_H_IMM:
4448 case AArch64::LDNF1SB_H_IMM:
4449 case AArch64::LDNF1H_S_IMM:
4450 case AArch64::LDNF1SH_S_IMM:
4451 case AArch64::LDNF1W_D_IMM:
4452 case AArch64::LDNF1SW_D_IMM:
4453 // A half vector's worth of data
4454 // Width = mbytes * elements
4455 Scale = TypeSize::getScalable(8);
4456 Width = TypeSize::getScalable(8);
4457 MinOffset = -8;
4458 MaxOffset = 7;
4459 break;
4460 case AArch64::LD1B_S_IMM:
4461 case AArch64::LD1SB_S_IMM:
4462 case AArch64::LD1H_D_IMM:
4463 case AArch64::LD1SH_D_IMM:
4464 case AArch64::ST1B_S_IMM:
4465 case AArch64::ST1H_D_IMM:
4466 case AArch64::LDNF1B_S_IMM:
4467 case AArch64::LDNF1SB_S_IMM:
4468 case AArch64::LDNF1H_D_IMM:
4469 case AArch64::LDNF1SH_D_IMM:
4470 // A quarter vector's worth of data
4471 // Width = mbytes * elements
4472 Scale = TypeSize::getScalable(4);
4473 Width = TypeSize::getScalable(4);
4474 MinOffset = -8;
4475 MaxOffset = 7;
4476 break;
4477 case AArch64::LD1B_D_IMM:
4478 case AArch64::LD1SB_D_IMM:
4479 case AArch64::ST1B_D_IMM:
4480 case AArch64::LDNF1B_D_IMM:
4481 case AArch64::LDNF1SB_D_IMM:
4482 // An eighth vector's worth of data
4483 // Width = mbytes * elements
4484 Scale = TypeSize::getScalable(2);
4485 Width = TypeSize::getScalable(2);
4486 MinOffset = -8;
4487 MaxOffset = 7;
4488 break;
4489 case AArch64::ST2Gi:
4490 case AArch64::ST2GPreIndex:
4491 case AArch64::ST2GPostIndex:
4492 case AArch64::STZ2Gi:
4493 case AArch64::STZ2GPreIndex:
4494 case AArch64::STZ2GPostIndex:
4495 Scale = TypeSize::getFixed(16);
4496 Width = TypeSize::getFixed(32);
4497 MinOffset = -256;
4498 MaxOffset = 255;
4499 break;
4500 case AArch64::STGPi:
4501 case AArch64::STGPpost:
4502 case AArch64::STGPpre:
4503 Scale = TypeSize::getFixed(16);
4504 Width = TypeSize::getFixed(16);
4505 MinOffset = -64;
4506 MaxOffset = 63;
4507 break;
4508 case AArch64::LD1RB_IMM:
4509 case AArch64::LD1RB_H_IMM:
4510 case AArch64::LD1RB_S_IMM:
4511 case AArch64::LD1RB_D_IMM:
4512 case AArch64::LD1RSB_H_IMM:
4513 case AArch64::LD1RSB_S_IMM:
4514 case AArch64::LD1RSB_D_IMM:
4515 Scale = TypeSize::getFixed(1);
4516 Width = TypeSize::getFixed(1);
4517 MinOffset = 0;
4518 MaxOffset = 63;
4519 break;
4520 case AArch64::LD1RH_IMM:
4521 case AArch64::LD1RH_S_IMM:
4522 case AArch64::LD1RH_D_IMM:
4523 case AArch64::LD1RSH_S_IMM:
4524 case AArch64::LD1RSH_D_IMM:
4525 Scale = TypeSize::getFixed(2);
4526 Width = TypeSize::getFixed(2);
4527 MinOffset = 0;
4528 MaxOffset = 63;
4529 break;
4530 case AArch64::LD1RW_IMM:
4531 case AArch64::LD1RW_D_IMM:
4532 case AArch64::LD1RSW_IMM:
4533 Scale = TypeSize::getFixed(4);
4534 Width = TypeSize::getFixed(4);
4535 MinOffset = 0;
4536 MaxOffset = 63;
4537 break;
4538 case AArch64::LD1RD_IMM:
4539 Scale = TypeSize::getFixed(8);
4540 Width = TypeSize::getFixed(8);
4541 MinOffset = 0;
4542 MaxOffset = 63;
4543 break;
4544 }
4545
4546 return true;
4547 }
4548
4549 // Scaling factor for unscaled load or store.
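// For example, getMemScale(AArch64::LDRQui) == 16 and
// getMemScale(AArch64::STRBBui) == 1.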
4550 int AArch64InstrInfo::getMemScale(unsigned Opc) {
4551 switch (Opc) {
4552 default:
4553 llvm_unreachable("Opcode has unknown scale!");
4554 case AArch64::LDRBBui:
4555 case AArch64::LDURBBi:
4556 case AArch64::LDRSBWui:
4557 case AArch64::LDURSBWi:
4558 case AArch64::STRBBui:
4559 case AArch64::STURBBi:
4560 return 1;
4561 case AArch64::LDRHHui:
4562 case AArch64::LDURHHi:
4563 case AArch64::LDRSHWui:
4564 case AArch64::LDURSHWi:
4565 case AArch64::STRHHui:
4566 case AArch64::STURHHi:
4567 return 2;
4568 case AArch64::LDRSui:
4569 case AArch64::LDURSi:
4570 case AArch64::LDRSpre:
4571 case AArch64::LDRSWui:
4572 case AArch64::LDURSWi:
4573 case AArch64::LDRSWpre:
4574 case AArch64::LDRWpre:
4575 case AArch64::LDRWui:
4576 case AArch64::LDURWi:
4577 case AArch64::STRSui:
4578 case AArch64::STURSi:
4579 case AArch64::STRSpre:
4580 case AArch64::STRWui:
4581 case AArch64::STURWi:
4582 case AArch64::STRWpre:
4583 case AArch64::LDPSi:
4584 case AArch64::LDPSWi:
4585 case AArch64::LDPWi:
4586 case AArch64::STPSi:
4587 case AArch64::STPWi:
4588 return 4;
4589 case AArch64::LDRDui:
4590 case AArch64::LDURDi:
4591 case AArch64::LDRDpre:
4592 case AArch64::LDRXui:
4593 case AArch64::LDURXi:
4594 case AArch64::LDRXpre:
4595 case AArch64::STRDui:
4596 case AArch64::STURDi:
4597 case AArch64::STRDpre:
4598 case AArch64::STRXui:
4599 case AArch64::STURXi:
4600 case AArch64::STRXpre:
4601 case AArch64::LDPDi:
4602 case AArch64::LDPXi:
4603 case AArch64::STPDi:
4604 case AArch64::STPXi:
4605 return 8;
4606 case AArch64::LDRQui:
4607 case AArch64::LDURQi:
4608 case AArch64::STRQui:
4609 case AArch64::STURQi:
4610 case AArch64::STRQpre:
4611 case AArch64::LDPQi:
4612 case AArch64::LDRQpre:
4613 case AArch64::STPQi:
4614 case AArch64::STGi:
4615 case AArch64::STZGi:
4616 case AArch64::ST2Gi:
4617 case AArch64::STZ2Gi:
4618 case AArch64::STGPi:
4619 return 16;
4620 }
4621 }
4622
4623 bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
4624 switch (MI.getOpcode()) {
4625 default:
4626 return false;
4627 case AArch64::LDRWpre:
4628 case AArch64::LDRXpre:
4629 case AArch64::LDRSWpre:
4630 case AArch64::LDRSpre:
4631 case AArch64::LDRDpre:
4632 case AArch64::LDRQpre:
4633 return true;
4634 }
4635 }
4636
4637 bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
4638 switch (MI.getOpcode()) {
4639 default:
4640 return false;
4641 case AArch64::STRWpre:
4642 case AArch64::STRXpre:
4643 case AArch64::STRSpre:
4644 case AArch64::STRDpre:
4645 case AArch64::STRQpre:
4646 return true;
4647 }
4648 }
4649
4650 bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
4651 return isPreLd(MI) || isPreSt(MI);
4652 }
4653
4654 bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
4655 switch (MI.getOpcode()) {
4656 default:
4657 return false;
4658 case AArch64::LDPSi:
4659 case AArch64::LDPSWi:
4660 case AArch64::LDPDi:
4661 case AArch64::LDPQi:
4662 case AArch64::LDPWi:
4663 case AArch64::LDPXi:
4664 case AArch64::STPSi:
4665 case AArch64::STPDi:
4666 case AArch64::STPQi:
4667 case AArch64::STPWi:
4668 case AArch64::STPXi:
4669 case AArch64::STGPi:
4670 return true;
4671 }
4672 }
4673
4674 const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
4675 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
4676 unsigned Idx =
4677 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
4678 : 1;
4679 return MI.getOperand(Idx);
4680 }
4681
4682 const MachineOperand &
4683 AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
4684 assert(MI.mayLoadOrStore() && "Load or store instruction expected");
4685 unsigned Idx =
4686 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
4687 : 2;
4688 return MI.getOperand(Idx);
4689 }
4690
4691 const MachineOperand &
4692 AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
4693 switch (MI.getOpcode()) {
4694 default:
4695 llvm_unreachable("Unexpected opcode");
4696 case AArch64::LDRBroX:
4697 case AArch64::LDRBBroX:
4698 case AArch64::LDRSBXroX:
4699 case AArch64::LDRSBWroX:
4700 case AArch64::LDRHroX:
4701 case AArch64::LDRHHroX:
4702 case AArch64::LDRSHXroX:
4703 case AArch64::LDRSHWroX:
4704 case AArch64::LDRWroX:
4705 case AArch64::LDRSroX:
4706 case AArch64::LDRSWroX:
4707 case AArch64::LDRDroX:
4708 case AArch64::LDRXroX:
4709 case AArch64::LDRQroX:
4710 return MI.getOperand(4);
4711 }
4712 }
4713
4714 static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
4715 Register Reg) {
4716 if (MI.getParent() == nullptr)
4717 return nullptr;
4718 const MachineFunction *MF = MI.getParent()->getParent();
4719 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
4720 }
4721
4722 bool AArch64InstrInfo::isHForm(const MachineInstr &MI) {
4723 auto IsHFPR = [&](const MachineOperand &Op) {
4724 if (!Op.isReg())
4725 return false;
4726 auto Reg = Op.getReg();
4727 if (Reg.isPhysical())
4728 return AArch64::FPR16RegClass.contains(Reg);
4729 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4730 return TRC == &AArch64::FPR16RegClass ||
4731 TRC == &AArch64::FPR16_loRegClass;
4732 };
4733 return llvm::any_of(MI.operands(), IsHFPR);
4734 }
4735
4736 bool AArch64InstrInfo::isQForm(const MachineInstr &MI) {
4737 auto IsQFPR = [&](const MachineOperand &Op) {
4738 if (!Op.isReg())
4739 return false;
4740 auto Reg = Op.getReg();
4741 if (Reg.isPhysical())
4742 return AArch64::FPR128RegClass.contains(Reg);
4743 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4744 return TRC == &AArch64::FPR128RegClass ||
4745 TRC == &AArch64::FPR128_loRegClass;
4746 };
4747 return llvm::any_of(MI.operands(), IsQFPR);
4748 }
4749
4750 bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) {
4751 switch (MI.getOpcode()) {
4752 case AArch64::BRK:
4753 case AArch64::HLT:
4754 case AArch64::PACIASP:
4755 case AArch64::PACIBSP:
4756 // Implicit BTI behavior.
4757 return true;
4758 case AArch64::PAUTH_PROLOGUE:
4759 // PAUTH_PROLOGUE expands to PACI(A|B)SP.
4760 return true;
4761 case AArch64::HINT: {
4762 unsigned Imm = MI.getOperand(0).getImm();
4763 // Explicit BTI instruction.
4764 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
4765 return true;
4766 // PACI(A|B)SP instructions.
4767 if (Imm == 25 || Imm == 27)
4768 return true;
4769 return false;
4770 }
4771 default:
4772 return false;
4773 }
4774 }
4775
4776 bool AArch64InstrInfo::isFpOrNEON(Register Reg) {
4777 if (Reg == 0)
4778 return false;
4779 assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
4780 return AArch64::FPR128RegClass.contains(Reg) ||
4781 AArch64::FPR64RegClass.contains(Reg) ||
4782 AArch64::FPR32RegClass.contains(Reg) ||
4783 AArch64::FPR16RegClass.contains(Reg) ||
4784 AArch64::FPR8RegClass.contains(Reg);
4785 }
4786
4787 bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) {
4788 auto IsFPR = [&](const MachineOperand &Op) {
4789 if (!Op.isReg())
4790 return false;
4791 auto Reg = Op.getReg();
4792 if (Reg.isPhysical())
4793 return isFpOrNEON(Reg);
4794
4795 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4796 return TRC == &AArch64::FPR128RegClass ||
4797 TRC == &AArch64::FPR128_loRegClass ||
4798 TRC == &AArch64::FPR64RegClass ||
4799 TRC == &AArch64::FPR64_loRegClass ||
4800 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
4801 TRC == &AArch64::FPR8RegClass;
4802 };
4803 return llvm::any_of(MI.operands(), IsFPR);
4804 }
4805
4806 // Scale the unscaled offsets. Returns false if the unscaled offset can't be
4807 // scaled.
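// For example, an STURXi (scale 8) with byte offset 24 scales to element
// offset 3, while byte offset 20 cannot be scaled.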
4808 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
4809 int Scale = AArch64InstrInfo::getMemScale(Opc);
4810
4811 // If the byte-offset isn't a multiple of the stride, we can't scale this
4812 // offset.
4813 if (Offset % Scale != 0)
4814 return false;
4815
4816 // Convert the byte-offset used by unscaled into an "element" offset used
4817 // by the scaled pair load/store instructions.
4818 Offset /= Scale;
4819 return true;
4820 }
4821
4822 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
4823 if (FirstOpc == SecondOpc)
4824 return true;
4825 // We can also pair sign-ext and zero-ext instructions.
4826 switch (FirstOpc) {
4827 default:
4828 return false;
4829 case AArch64::STRSui:
4830 case AArch64::STURSi:
4831 return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
4832 case AArch64::STRDui:
4833 case AArch64::STURDi:
4834 return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
4835 case AArch64::STRQui:
4836 case AArch64::STURQi:
4837 return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
4838 case AArch64::STRWui:
4839 case AArch64::STURWi:
4840 return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
4841 case AArch64::STRXui:
4842 case AArch64::STURXi:
4843 return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
4844 case AArch64::LDRSui:
4845 case AArch64::LDURSi:
4846 return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
4847 case AArch64::LDRDui:
4848 case AArch64::LDURDi:
4849 return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
4850 case AArch64::LDRQui:
4851 case AArch64::LDURQi:
4852 return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
4853 case AArch64::LDRWui:
4854 case AArch64::LDURWi:
4855 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
4856 case AArch64::LDRSWui:
4857 case AArch64::LDURSWi:
4858 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
4859 case AArch64::LDRXui:
4860 case AArch64::LDURXi:
4861 return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
4862 }
4863 // These instructions can't be paired based on their opcodes.
4864 return false;
4865 }
4866
4867 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
4868 int64_t Offset1, unsigned Opcode1, int FI2,
4869 int64_t Offset2, unsigned Opcode2) {
4870 // Accesses through fixed stack object frame indices may access a different
4871 // fixed stack slot. Check that the object offsets + offsets match.
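// For example, two 8-byte loads from fixed objects at base offsets 16 and 24,
// each with immediate offset 0, cluster because their scaled offsets (2 and 3)
// are consecutive.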
4872 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
4873 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
4874 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
4875 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
4876 // Convert to scaled object offsets.
4877 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
4878 if (ObjectOffset1 % Scale1 != 0)
4879 return false;
4880 ObjectOffset1 /= Scale1;
4881 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
4882 if (ObjectOffset2 % Scale2 != 0)
4883 return false;
4884 ObjectOffset2 /= Scale2;
4885 ObjectOffset1 += Offset1;
4886 ObjectOffset2 += Offset2;
4887 return ObjectOffset1 + 1 == ObjectOffset2;
4888 }
4889
4890 return FI1 == FI2;
4891 }
4892
4893 /// Detect opportunities for ldp/stp formation.
4894 ///
4895 /// Only called for LdSt for which getMemOperandWithOffset returns true.
4896 bool AArch64InstrInfo::shouldClusterMemOps(
4897 ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
4898 bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
4899 int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
4900 unsigned NumBytes) const {
4901 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
4902 const MachineOperand &BaseOp1 = *BaseOps1.front();
4903 const MachineOperand &BaseOp2 = *BaseOps2.front();
4904 const MachineInstr &FirstLdSt = *BaseOp1.getParent();
4905 const MachineInstr &SecondLdSt = *BaseOp2.getParent();
4906 if (BaseOp1.getType() != BaseOp2.getType())
4907 return false;
4908
4909 assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
4910 "Only base registers and frame indices are supported.");
4911
4912 // Check for both base regs and base FI.
4913 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
4914 return false;
4915
4916 // Only cluster up to a single pair.
4917 if (ClusterSize > 2)
4918 return false;
4919
4920 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
4921 return false;
4922
4923 // Can we pair these instructions based on their opcodes?
4924 unsigned FirstOpc = FirstLdSt.getOpcode();
4925 unsigned SecondOpc = SecondLdSt.getOpcode();
4926 if (!canPairLdStOpc(FirstOpc, SecondOpc))
4927 return false;
4928
4929 // Can't merge volatiles or load/stores that have a hint to avoid pair
4930 // formation, for example.
4931 if (!isCandidateToMergeOrPair(FirstLdSt) ||
4932 !isCandidateToMergeOrPair(SecondLdSt))
4933 return false;
4934
4935 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
4936 int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
4937 if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
4938 return false;
4939
4940 int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
4941 if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
4942 return false;
4943
4944 // Pairwise instructions have a 7-bit signed offset field.
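// That is, scaled element offsets in [-64, 63]. Only the lower offset is
// encoded in the resulting paired instruction, so checking Offset1 suffices.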
4945 if (Offset1 > 63 || Offset1 < -64)
4946 return false;
4947
4948 // The caller should already have ordered First/SecondLdSt by offset.
4949 // Note: this ordering is not guaranteed for non-equal frame index bases.
4950 if (BaseOp1.isFI()) {
4951 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
4952 "Caller should have ordered offsets.");
4953
4954 const MachineFrameInfo &MFI =
4955 FirstLdSt.getParent()->getParent()->getFrameInfo();
4956 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
4957 BaseOp2.getIndex(), Offset2, SecondOpc);
4958 }
4959
4960 assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
4961
4962 return Offset1 + 1 == Offset2;
4963 }
4964
4965 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
4966 MCRegister Reg, unsigned SubIdx,
4967 unsigned State,
4968 const TargetRegisterInfo *TRI) {
4969 if (!SubIdx)
4970 return MIB.addReg(Reg, State);
4971
4972 if (Reg.isPhysical())
4973 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
4974 return MIB.addReg(Reg, State, SubIdx);
4975 }
4976
4977 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
4978 unsigned NumRegs) {
4979 // We really want the positive remainder mod 32 here; that happens to be
4980 // easily obtainable with a mask.
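// For example, with register encodings Dest=1, Src=0, NumRegs=3:
// (1 - 0) & 0x1f == 1 < 3, so a forward (low-to-high) copy would overwrite
// source sub-registers and the caller must copy backwards instead.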
4981 return ((DestReg - SrcReg) & 0x1f) < NumRegs;
4982 }
4983
4984 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
4985 MachineBasicBlock::iterator I,
4986 const DebugLoc &DL, MCRegister DestReg,
4987 MCRegister SrcReg, bool KillSrc,
4988 unsigned Opcode,
4989 ArrayRef<unsigned> Indices) const {
4990 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
4991 const TargetRegisterInfo *TRI = &getRegisterInfo();
4992 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
4993 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
4994 unsigned NumRegs = Indices.size();
4995
4996 int SubReg = 0, End = NumRegs, Incr = 1;
4997 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
4998 SubReg = NumRegs - 1;
4999 End = -1;
5000 Incr = -1;
5001 }
5002
5003 for (; SubReg != End; SubReg += Incr) {
5004 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
5005 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
5006 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
5007 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
5008 }
5009 }
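
// For example (illustrative), copying Q0_Q1 into Q2_Q3 with Opcode ORRv16i8
// emits one vector ORR per sub-register:
//   orr v2.16b, v0.16b, v0.16b
//   orr v3.16b, v1.16b, v1.16b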

void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator I,
                                       const DebugLoc &DL, MCRegister DestReg,
                                       MCRegister SrcReg, bool KillSrc,
                                       unsigned Opcode, unsigned ZeroReg,
                                       llvm::ArrayRef<unsigned> Indices) const {
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  unsigned NumRegs = Indices.size();

#ifndef NDEBUG
  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
  assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
         "GPR reg sequences should not be able to overlap");
#endif

  for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
    const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
    AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
    MIB.addReg(ZeroReg);
    AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
    MIB.addImm(0);
  }
}
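
// For example (illustrative), copying the sequential pair X0_X1 into X2_X3
// with Opcode ORRXrs and ZeroReg XZR emits:
//   orr x2, xzr, x0
//   orr x3, xzr, x1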

void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator I,
                                   const DebugLoc &DL, Register DestReg,
                                   Register SrcReg, bool KillSrc,
                                   bool RenamableDest,
                                   bool RenamableSrc) const {
  if (AArch64::GPR32spRegClass.contains(DestReg) &&
      (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
    const TargetRegisterInfo *TRI = &getRegisterInfo();

    if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
      // If either operand is WSP, expand to ADD #0.
      if (Subtarget.hasZeroCycleRegMoveGPR64() &&
          !Subtarget.hasZeroCycleRegMoveGPR32()) {
        // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
        MCRegister DestRegX = TRI->getMatchingSuperReg(
            DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
        MCRegister SrcRegX = TRI->getMatchingSuperReg(
            SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
        // This instruction is reading and writing X registers. This may upset
        // the register scavenger and machine verifier, so we need to indicate
        // that we are reading an undefined value from SrcRegX, but a proper
        // value from SrcReg.
        BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
            .addReg(SrcRegX, RegState::Undef)
            .addImm(0)
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
            .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
      } else {
        BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc))
            .addImm(0)
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
      }
    } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
      BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
          .addImm(0)
          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
    } else {
      if (Subtarget.hasZeroCycleRegMoveGPR64() &&
          !Subtarget.hasZeroCycleRegMoveGPR32()) {
        // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
        MCRegister DestRegX = TRI->getMatchingSuperReg(
            DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
        MCRegister SrcRegX = TRI->getMatchingSuperReg(
            SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
        // This instruction is reading and writing X registers. This may upset
        // the register scavenger and machine verifier, so we need to indicate
        // that we are reading an undefined value from SrcRegX, but a proper
        // value from SrcReg.
        BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
            .addReg(AArch64::XZR)
            .addReg(SrcRegX, RegState::Undef)
            .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
      } else {
        // Otherwise, expand to ORR WZR.
        BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
            .addReg(AArch64::WZR)
            .addReg(SrcReg, getKillRegState(KillSrc));
      }
    }
    return;
  }

  // Copy a Predicate register by ORRing with itself.
  if (AArch64::PPRRegClass.contains(DestReg) &&
      AArch64::PPRRegClass.contains(SrcReg)) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
        .addReg(SrcReg) // Pg
        .addReg(SrcReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  // Copy a predicate-as-counter register by ORRing with itself as if it
  // were a regular predicate (mask) register.
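  // For example (illustrative), a copy of PN3 into PN5 is emitted as
  // "orr p5.b, p3/z, p3.b, p3.b", with PN5 added as an implicit def.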
  bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
  bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
  if (DestIsPNR || SrcIsPNR) {
    auto ToPPR = [](MCRegister R) -> MCRegister {
      return (R - AArch64::PN0) + AArch64::P0;
    };
    MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg.asMCReg();
    MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg.asMCReg();

    if (PPRSrcReg != PPRDestReg) {
      auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)
                       .addReg(PPRSrcReg) // Pg
                       .addReg(PPRSrcReg)
                       .addReg(PPRSrcReg, getKillRegState(KillSrc));
      if (DestIsPNR)
        NewMI.addDef(DestReg, RegState::Implicit);
    }
    return;
  }

  // Copy a Z register by ORRing with itself.
  if (AArch64::ZPRRegClass.contains(DestReg) &&
      AArch64::ZPRRegClass.contains(SrcReg)) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
        .addReg(SrcReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  // Copy a Z register pair by copying the individual sub-registers.
  if ((AArch64::ZPR2RegClass.contains(DestReg) ||
       AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
      (AArch64::ZPR2RegClass.contains(SrcReg) ||
       AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
                     Indices);
    return;
  }

  // Copy a Z register triple by copying the individual sub-registers.
  if (AArch64::ZPR3RegClass.contains(DestReg) &&
      AArch64::ZPR3RegClass.contains(SrcReg)) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
                                       AArch64::zsub2};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
                     Indices);
    return;
  }

  // Copy a Z register quad by copying the individual sub-registers.
  if ((AArch64::ZPR4RegClass.contains(DestReg) ||
       AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
      (AArch64::ZPR4RegClass.contains(SrcReg) ||
       AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
    assert(Subtarget.isSVEorStreamingSVEAvailable() &&
           "Unexpected SVE register.");
    static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
                                       AArch64::zsub2, AArch64::zsub3};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
                     Indices);
    return;
  }

  if (AArch64::GPR64spRegClass.contains(DestReg) &&
      (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
    if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
      // If either operand is SP, expand to ADD #0.
      BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addImm(0)
          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
    } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
      BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
          .addImm(0)
          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
    } else {
      // Otherwise, expand to ORR XZR.
      BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
          .addReg(AArch64::XZR)
          .addReg(SrcReg, getKillRegState(KillSrc));
    }
    return;
  }

  // Copy a DDDD register quad by copying the individual sub-registers.
  if (AArch64::DDDDRegClass.contains(DestReg) &&
      AArch64::DDDDRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
                                       AArch64::dsub2, AArch64::dsub3};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
                     Indices);
    return;
  }

  // Copy a DDD register triple by copying the individual sub-registers.
  if (AArch64::DDDRegClass.contains(DestReg) &&
      AArch64::DDDRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
                                       AArch64::dsub2};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
                     Indices);
    return;
  }

  // Copy a DD register pair by copying the individual sub-registers.
  if (AArch64::DDRegClass.contains(DestReg) &&
      AArch64::DDRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
                     Indices);
    return;
  }

  // Copy a QQQQ register quad by copying the individual sub-registers.
  if (AArch64::QQQQRegClass.contains(DestReg) &&
      AArch64::QQQQRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
                                       AArch64::qsub2, AArch64::qsub3};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
                     Indices);
    return;
  }

  // Copy a QQQ register triple by copying the individual sub-registers.
  if (AArch64::QQQRegClass.contains(DestReg) &&
      AArch64::QQQRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
                                       AArch64::qsub2};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
                     Indices);
    return;
  }

  // Copy a QQ register pair by copying the individual sub-registers.
  if (AArch64::QQRegClass.contains(DestReg) &&
      AArch64::QQRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
                     Indices);
    return;
  }

  if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
      AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
    copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
                    AArch64::XZR, Indices);
    return;
  }

  if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
      AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
    static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
    copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
                    AArch64::WZR, Indices);
    return;
  }

  if (AArch64::FPR128RegClass.contains(DestReg) &&
      AArch64::FPR128RegClass.contains(SrcReg)) {
    if (Subtarget.isSVEorStreamingSVEAvailable() &&
        !Subtarget.isNeonAvailable())
      BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
          .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
          .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
          .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
    else if (Subtarget.isNeonAvailable())
      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
          .addReg(SrcReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
    else {
      BuildMI(MBB, I, DL, get(AArch64::STRQpre))
          .addReg(AArch64::SP, RegState::Define)
          .addReg(SrcReg, getKillRegState(KillSrc))
          .addReg(AArch64::SP)
          .addImm(-16);
      BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
          .addReg(AArch64::SP, RegState::Define)
          .addReg(DestReg, RegState::Define)
          .addReg(AArch64::SP)
          .addImm(16);
    }
    return;
  }

  if (AArch64::FPR64RegClass.contains(DestReg) &&
      AArch64::FPR64RegClass.contains(SrcReg)) {
    BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (AArch64::FPR32RegClass.contains(DestReg) &&
      AArch64::FPR32RegClass.contains(SrcReg)) {
    if (Subtarget.hasZeroCycleRegMoveFPR64() &&
        !Subtarget.hasZeroCycleRegMoveFPR32()) {
      const TargetRegisterInfo *TRI = &getRegisterInfo();
      MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
                                                     &AArch64::FPR64RegClass);
      MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::ssub,
                                                    &AArch64::FPR64RegClass);
      // This instruction is reading and writing D registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegD, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
          .addReg(SrcRegD, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
    } else {
      BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
    }
    return;
  }

  if (AArch64::FPR16RegClass.contains(DestReg) &&
      AArch64::FPR16RegClass.contains(SrcReg)) {
    if (Subtarget.hasZeroCycleRegMoveFPR64() &&
        !Subtarget.hasZeroCycleRegMoveFPR32()) {
      const TargetRegisterInfo *TRI = &getRegisterInfo();
      MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
                                                     &AArch64::FPR64RegClass);
      MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::hsub,
                                                    &AArch64::FPR64RegClass);
      // This instruction is reading and writing D registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegD, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
          .addReg(SrcRegD, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
    } else {
      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
                                       &AArch64::FPR32RegClass);
      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
                                      &AArch64::FPR32RegClass);
      BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
    }
    return;
  }

  if (AArch64::FPR8RegClass.contains(DestReg) &&
      AArch64::FPR8RegClass.contains(SrcReg)) {
    if (Subtarget.hasZeroCycleRegMoveFPR64() &&
        !Subtarget.hasZeroCycleRegMoveFPR32()) {
      const TargetRegisterInfo *TRI = &getRegisterInfo();
      MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::bsub,
                                                     &AArch64::FPR64RegClass);
      MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::bsub,
                                                    &AArch64::FPR64RegClass);
      // This instruction is reading and writing D registers. This may upset
      // the register scavenger and machine verifier, so we need to indicate
      // that we are reading an undefined value from SrcRegD, but a proper
      // value from SrcReg.
      BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
          .addReg(SrcRegD, RegState::Undef)
          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
    } else {
      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
                                       &AArch64::FPR32RegClass);
      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
                                      &AArch64::FPR32RegClass);
      BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
          .addReg(SrcReg, getKillRegState(KillSrc));
    }
    return;
  }

  // Copies between GPR64 and FPR64.
  if (AArch64::FPR64RegClass.contains(DestReg) &&
      AArch64::GPR64RegClass.contains(SrcReg)) {
    BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }
  if (AArch64::GPR64RegClass.contains(DestReg) &&
      AArch64::FPR64RegClass.contains(SrcReg)) {
    BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }
  // Copies between GPR32 and FPR32.
  if (AArch64::FPR32RegClass.contains(DestReg) &&
      AArch64::GPR32RegClass.contains(SrcReg)) {
    BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }
  if (AArch64::GPR32RegClass.contains(DestReg) &&
      AArch64::FPR32RegClass.contains(SrcReg)) {
    BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
        .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  }

  if (DestReg == AArch64::NZCV) {
    assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
    BuildMI(MBB, I, DL, get(AArch64::MSR))
        .addImm(AArch64SysReg::NZCV)
        .addReg(SrcReg, getKillRegState(KillSrc))
        .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
    return;
  }

  if (SrcReg == AArch64::NZCV) {
    assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
    BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
        .addImm(AArch64SysReg::NZCV)
        .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
    return;
  }

#ifndef NDEBUG
  const TargetRegisterInfo &TRI = getRegisterInfo();
  errs() << TRI.getRegAsmName(DestReg) << " = COPY "
         << TRI.getRegAsmName(SrcReg) << "\n";
#endif
  llvm_unreachable("unimplemented reg-to-reg copy");
}

static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
                                    MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator InsertBefore,
                                    const MCInstrDesc &MCID,
                                    Register SrcReg, bool IsKill,
                                    unsigned SubIdx0, unsigned SubIdx1, int FI,
                                    MachineMemOperand *MMO) {
  Register SrcReg0 = SrcReg;
  Register SrcReg1 = SrcReg;
  if (SrcReg.isPhysical()) {
    SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
    SubIdx0 = 0;
    SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
    SubIdx1 = 0;
  }
  BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
      .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
      .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
      .addFrameIndex(FI)
      .addImm(0)
      .addMemOperand(MMO);
}
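
// For example (illustrative), spilling the physical pair X0_X1 with
// MCID = STPXi and SubIdx0/SubIdx1 = sube64/subo64 resolves the sub-registers
// up front and emits "stp x0, x1, [<fi>, #0]"; for a virtual pair the
// sub-register indices are kept on the operands instead.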

void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MBBI,
                                           Register SrcReg, bool isKill, int FI,
                                           const TargetRegisterClass *RC,
                                           const TargetRegisterInfo *TRI,
                                           Register VReg,
                                           MachineInstr::MIFlag Flags) const {
  MachineFunction &MF = *MBB.getParent();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO =
      MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                              MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
  unsigned Opc = 0;
  bool Offset = true;
  MCRegister PNRReg = MCRegister::NoRegister;
  unsigned StackID = TargetStackID::Default;
  switch (TRI->getSpillSize(*RC)) {
  case 1:
    if (AArch64::FPR8RegClass.hasSubClassEq(RC))
      Opc = AArch64::STRBui;
    break;
  case 2: {
    if (AArch64::FPR16RegClass.hasSubClassEq(RC))
      Opc = AArch64::STRHui;
    else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
             AArch64::PPRRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_PXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  }
  case 4:
    if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
      Opc = AArch64::STRWui;
      if (SrcReg.isVirtual())
        MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
      else
        assert(SrcReg != AArch64::WSP);
    } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
      Opc = AArch64::STRSui;
    else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
      Opc = AArch64::STR_PPXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 8:
    if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
      Opc = AArch64::STRXui;
      if (SrcReg.isVirtual())
        MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
      else
        assert(SrcReg != AArch64::SP);
    } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
      Opc = AArch64::STRDui;
    } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
      storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
                              get(AArch64::STPWi), SrcReg, isKill,
                              AArch64::sube32, AArch64::subo32, FI, MMO);
      return;
    }
    break;
  case 16:
    if (AArch64::FPR128RegClass.hasSubClassEq(RC))
      Opc = AArch64::STRQui;
    else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Twov1d;
      Offset = false;
    } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
      storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
                              get(AArch64::STPXi), SrcReg, isKill,
                              AArch64::sube64, AArch64::subo64, FI, MMO);
      return;
    } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZXI;
      StackID = TargetStackID::ScalableVector;
    } else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected predicate store without SVE store instructions");
      Opc = AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 24:
    if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Threev1d;
      Offset = false;
    }
    break;
  case 32:
    if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Fourv1d;
      Offset = false;
    } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Twov2d;
      Offset = false;
    } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS;
      StackID = TargetStackID::ScalableVector;
    } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 48:
    if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Threev2d;
      Offset = false;
    } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZZZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 64:
    if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
      Opc = AArch64::ST1Fourv2d;
      Offset = false;
    } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS;
      StackID = TargetStackID::ScalableVector;
    } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register store without SVE store instructions");
      Opc = AArch64::STR_ZZZZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  }
  assert(Opc && "Unknown register class");
  MFI.setStackID(FI, StackID);

  const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
                                     .addReg(SrcReg, getKillRegState(isKill))
                                     .addFrameIndex(FI);

  if (Offset)
    MI.addImm(0);
  if (PNRReg.isValid())
    MI.addDef(PNRReg, RegState::Implicit);
  MI.addMemOperand(MMO);
}

static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator InsertBefore,
                                     const MCInstrDesc &MCID,
                                     Register DestReg, unsigned SubIdx0,
                                     unsigned SubIdx1, int FI,
                                     MachineMemOperand *MMO) {
  Register DestReg0 = DestReg;
  Register DestReg1 = DestReg;
  bool IsUndef = true;
  if (DestReg.isPhysical()) {
    DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
    SubIdx0 = 0;
    DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
    SubIdx1 = 0;
    IsUndef = false;
  }
  BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
      .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
      .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
      .addFrameIndex(FI)
      .addImm(0)
      .addMemOperand(MMO);
}

void AArch64InstrInfo::loadRegFromStackSlot(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
    int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
    Register VReg, MachineInstr::MIFlag Flags) const {
  MachineFunction &MF = *MBB.getParent();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO =
      MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
                              MFI.getObjectSize(FI), MFI.getObjectAlign(FI));

  unsigned Opc = 0;
  bool Offset = true;
  unsigned StackID = TargetStackID::Default;
  Register PNRReg = MCRegister::NoRegister;
  switch (TRI->getSpillSize(*RC)) {
  case 1:
    if (AArch64::FPR8RegClass.hasSubClassEq(RC))
      Opc = AArch64::LDRBui;
    break;
  case 2: {
    bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
    if (AArch64::FPR16RegClass.hasSubClassEq(RC))
      Opc = AArch64::LDRHui;
    else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      if (IsPNR)
        PNRReg = DestReg;
      Opc = AArch64::LDR_PXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  }
  case 4:
    if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
      Opc = AArch64::LDRWui;
      if (DestReg.isVirtual())
        MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
      else
        assert(DestReg != AArch64::WSP);
    } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
      Opc = AArch64::LDRSui;
    else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
      Opc = AArch64::LDR_PPXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 8:
    if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
      Opc = AArch64::LDRXui;
      if (DestReg.isVirtual())
        MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
      else
        assert(DestReg != AArch64::SP);
    } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
      Opc = AArch64::LDRDui;
    } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
      loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
                               get(AArch64::LDPWi), DestReg, AArch64::sube32,
                               AArch64::subo32, FI, MMO);
      return;
    }
    break;
  case 16:
    if (AArch64::FPR128RegClass.hasSubClassEq(RC))
      Opc = AArch64::LDRQui;
    else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Twov1d;
      Offset = false;
    } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
      loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
                               get(AArch64::LDPXi), DestReg, AArch64::sube64,
                               AArch64::subo64, FI, MMO);
      return;
    } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZXI;
      StackID = TargetStackID::ScalableVector;
    } else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected predicate load without SVE load instructions");
      Opc = AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 24:
    if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Threev1d;
      Offset = false;
    }
    break;
  case 32:
    if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Fourv1d;
      Offset = false;
    } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Twov2d;
      Offset = false;
    } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS;
      StackID = TargetStackID::ScalableVector;
    } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 48:
    if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Threev2d;
      Offset = false;
    } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZZZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 64:
    if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Fourv2d;
      Offset = false;
    } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS;
      StackID = TargetStackID::ScalableVector;
    } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZZZZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  }

  assert(Opc && "Unknown register class");
  MFI.setStackID(FI, StackID);

  const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
                                     .addReg(DestReg, getDefRegState(true))
                                     .addFrameIndex(FI);
  if (Offset)
    MI.addImm(0);
  if (PNRReg.isValid() && !PNRReg.isVirtual())
    MI.addDef(PNRReg, RegState::Implicit);
  MI.addMemOperand(MMO);
}

bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
                                           const MachineInstr &UseMI,
                                           const TargetRegisterInfo *TRI) {
  return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
                                         UseMI.getIterator()),
                [TRI](const MachineInstr &I) {
                  return I.modifiesRegister(AArch64::NZCV, TRI) ||
                         I.readsRegister(AArch64::NZCV, TRI);
                });
}

void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
    const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
  // The smallest scalable element supported by scaled SVE addressing modes is
  // a predicate, which is 2 scalable bytes in size. So the scalable byte
  // offset must always be a multiple of 2.
  assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");

  // VGSized offsets are divided by '2', because the VG register is the number
  // of 64-bit granules as opposed to 128-bit vector chunks, which is how the
  // 'n' in e.g. MVT::nxv1i8 is modelled.
  // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
  // VG = n * 2 and the DWARF offset must be VG * 8 bytes.
  ByteSized = Offset.getFixed();
  VGSized = Offset.getScalable() / 2;
}
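
// Worked example (illustrative): StackOffset::get(/*Fixed=*/32,
// /*Scalable=*/16) decomposes into ByteSized = 32 and VGSized = 8, i.e. a
// DWARF location of "32 + 8 * VG" bytes.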

/// Returns the parts into which this frame offset can be decomposed for the
/// purpose of describing a frame offset.
/// For non-scalable offsets this is simply its byte size.
void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
    const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
    int64_t &NumDataVectors) {
  // The smallest scalable element supported by scaled SVE addressing modes is
  // a predicate, which is 2 scalable bytes in size. So the scalable byte
  // offset must always be a multiple of 2.
  assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");

  NumBytes = Offset.getFixed();
  NumDataVectors = 0;
  NumPredicateVectors = Offset.getScalable() / 2;
  // This method is used to get the offsets to adjust the frame offset.
  // If the function requires ADDPL to be used and needs more than two ADDPL
  // instructions, part of the offset is folded into NumDataVectors so that it
  // uses ADDVL for part of it, reducing the number of ADDPL instructions.
  if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
      NumPredicateVectors > 62) {
    NumDataVectors = NumPredicateVectors / 8;
    NumPredicateVectors -= NumDataVectors * 8;
  }
}
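
// Worked example (illustrative): a scalable offset of 140 bytes gives
// NumPredicateVectors = 70, which exceeds 62, so it is split into
// NumDataVectors = 8 (a single ADDVL #8) and NumPredicateVectors = 6
// (a single ADDPL #6).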

// Convenience function to create a DWARF expression for
//   Expr + NumBytes + NumVGScaledBytes * AArch64::VG
static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes,
                                     int NumVGScaledBytes, unsigned VG,
                                     llvm::raw_string_ostream &Comment) {
  uint8_t buffer[16];

  if (NumBytes) {
    Expr.push_back(dwarf::DW_OP_consts);
    Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
    Expr.push_back((uint8_t)dwarf::DW_OP_plus);
    Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
  }

  if (NumVGScaledBytes) {
    Expr.push_back((uint8_t)dwarf::DW_OP_consts);
    Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));

    Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
    Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
    Expr.push_back(0);

    Expr.push_back((uint8_t)dwarf::DW_OP_mul);
    Expr.push_back((uint8_t)dwarf::DW_OP_plus);

    Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
            << std::abs(NumVGScaledBytes) << " * VG";
  }
}
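
// For NumBytes = 16 and NumVGScaledBytes = 8 this appends (illustrative):
//   DW_OP_consts 16, DW_OP_plus,
//   DW_OP_consts 8, DW_OP_bregx VG 0, DW_OP_mul, DW_OP_plus
// and extends the comment with " + 16 + 8 * VG".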

// Creates an MCCFIInstruction:
//   { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
                                               unsigned Reg,
                                               const StackOffset &Offset) {
  int64_t NumBytes, NumVGScaledBytes;
  AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
                                                        NumVGScaledBytes);
  std::string CommentBuffer;
  llvm::raw_string_ostream Comment(CommentBuffer);

  if (Reg == AArch64::SP)
    Comment << "sp";
  else if (Reg == AArch64::FP)
    Comment << "fp";
  else
    Comment << printReg(Reg, &TRI);

  // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG)
  SmallString<64> Expr;
  unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
  Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg));
  Expr.push_back(0);
  appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
                           TRI.getDwarfRegNum(AArch64::VG, true), Comment);

  // Wrap this into DW_CFA_def_cfa.
  SmallString<64> DefCfaExpr;
  DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
  uint8_t buffer[16];
  DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer));
  DefCfaExpr.append(Expr.str());
  return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(),
                                        Comment.str());
}
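
// For example (illustrative), a CFA of SP + 16 + 8 * VG produces a
// DW_CFA_def_cfa_expression escape whose expression is
//   DW_OP_breg31 0, DW_OP_consts 16, DW_OP_plus,
//   DW_OP_consts 8, DW_OP_bregx VG 0, DW_OP_mul, DW_OP_plus
// with the comment string "sp + 16 + 8 * VG".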

MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI,
                                    unsigned FrameReg, unsigned Reg,
                                    const StackOffset &Offset,
                                    bool LastAdjustmentWasScalable) {
  if (Offset.getScalable())
    return createDefCFAExpression(TRI, Reg, Offset);

  if (FrameReg == Reg && !LastAdjustmentWasScalable)
    return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));

  unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
  return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
}

MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI,
                                       unsigned Reg,
                                       const StackOffset &OffsetFromDefCFA) {
  int64_t NumBytes, NumVGScaledBytes;
  AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
      OffsetFromDefCFA, NumBytes, NumVGScaledBytes);

  unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);

  // Non-scalable offsets can use DW_CFA_offset directly.
  if (!NumVGScaledBytes)
    return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);

  std::string CommentBuffer;
  llvm::raw_string_ostream Comment(CommentBuffer);
  Comment << printReg(Reg, &TRI) << " @ cfa";

  // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
  SmallString<64> OffsetExpr;
  appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
                           TRI.getDwarfRegNum(AArch64::VG, true), Comment);

  // Wrap this into DW_CFA_expression
  SmallString<64> CfaExpr;
  CfaExpr.push_back(dwarf::DW_CFA_expression);
  uint8_t buffer[16];
  CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
  CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
  CfaExpr.append(OffsetExpr.str());

  return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(),
                                        Comment.str());
}

// Helper function to emit a frame offset adjustment from a given
// pointer (SrcReg), stored into DestReg. This function is explicit
// in that it requires the opcode.
static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
                               const DebugLoc &DL, unsigned DestReg,
                               unsigned SrcReg, int64_t Offset, unsigned Opc,
                               const TargetInstrInfo *TII,
                               MachineInstr::MIFlag Flag, bool NeedsWinCFI,
                               bool *HasWinCFI, bool EmitCFAOffset,
                               StackOffset CFAOffset, unsigned FrameReg) {
  int Sign = 1;
  unsigned MaxEncoding, ShiftSize;
  switch (Opc) {
  case AArch64::ADDXri:
  case AArch64::ADDSXri:
  case AArch64::SUBXri:
  case AArch64::SUBSXri:
    MaxEncoding = 0xfff;
    ShiftSize = 12;
    break;
  case AArch64::ADDVL_XXI:
  case AArch64::ADDPL_XXI:
  case AArch64::ADDSVL_XXI:
  case AArch64::ADDSPL_XXI:
    MaxEncoding = 31;
    ShiftSize = 0;
    if (Offset < 0) {
      MaxEncoding = 32;
      Sign = -1;
      Offset = -Offset;
    }
    break;
  default:
    llvm_unreachable("Unsupported opcode");
  }

  // `Offset` can be in bytes or in "scalable bytes".
  int VScale = 1;
  if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
    VScale = 16;
  else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
    VScale = 2;

  // FIXME: If the offset won't fit in 24-bits, compute the offset into a
  // scratch register. If DestReg is a virtual register, use it as the
  // scratch register; otherwise, create a new virtual register (to be
  // replaced by the scavenger at the end of PEI). That case can be optimized
  // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
  // register can be loaded with offset%8 and the add/sub can use an extending
  // instruction with LSL#3.
  // Currently the function handles any offsets but generates a poor sequence
  // of code.
  // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
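
  // Worked example (illustrative): for Opc = ADDXri and Offset = 0x123456,
  // the loop below first emits "add <dst>, <src>, #0x123, lsl #12" and then
  // "add <dst>, <dst>, #0x456", since only 12-bit immediates (optionally
  // shifted left by 12) can be encoded.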

  const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
  Register TmpReg = DestReg;
  if (TmpReg == AArch64::XZR)
    TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
        &AArch64::GPR64RegClass);
  do {
    uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
    unsigned LocalShiftSize = 0;
    if (ThisVal > MaxEncoding) {
      ThisVal = ThisVal >> ShiftSize;
      LocalShiftSize = ShiftSize;
    }
    assert((ThisVal >> ShiftSize) <= MaxEncoding &&
           "Encoding cannot handle value that big");

    Offset -= ThisVal << LocalShiftSize;
    if (Offset == 0)
      TmpReg = DestReg;
    auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
                   .addReg(SrcReg)
                   .addImm(Sign * (int)ThisVal);
    if (ShiftSize)
      MBI = MBI.addImm(
          AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
    MBI = MBI.setMIFlag(Flag);

    auto Change =
        VScale == 1
            ? StackOffset::getFixed(ThisVal << LocalShiftSize)
            : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
    if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
      CFAOffset += Change;
    else
      CFAOffset -= Change;
    if (EmitCFAOffset && DestReg == TmpReg) {
      MachineFunction &MF = *MBB.getParent();
      const TargetSubtargetInfo &STI = MF.getSubtarget();
      const TargetRegisterInfo &TRI = *STI.getRegisterInfo();

      unsigned CFIIndex = MF.addFrameInst(
          createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
          .addCFIIndex(CFIIndex)
          .setMIFlags(Flag);
    }

    if (NeedsWinCFI) {
      int Imm = (int)(ThisVal << LocalShiftSize);
      if (VScale != 1 && DestReg == AArch64::SP) {
        if (HasWinCFI)
          *HasWinCFI = true;
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AllocZ))
            .addImm(ThisVal)
            .setMIFlag(Flag);
      } else if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
                 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
        assert(VScale == 1 && "Expected non-scalable operation");
        if (HasWinCFI)
          *HasWinCFI = true;
        if (Imm == 0)
          BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
        else
          BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
              .addImm(Imm)
              .setMIFlag(Flag);
        assert(Offset == 0 && "Expected remaining offset to be zero to "
                              "emit a single SEH directive");
      } else if (DestReg == AArch64::SP) {
        assert(VScale == 1 && "Expected non-scalable operation");
        if (HasWinCFI)
          *HasWinCFI = true;
        assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
            .addImm(Imm)
            .setMIFlag(Flag);
      }
    }

    SrcReg = TmpReg;
  } while (Offset);
}

void llvm::emitFrameOffset(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
                           unsigned DestReg, unsigned SrcReg,
                           StackOffset Offset, const TargetInstrInfo *TII,
                           MachineInstr::MIFlag Flag, bool SetNZCV,
                           bool NeedsWinCFI, bool *HasWinCFI,
                           bool EmitCFAOffset, StackOffset CFAOffset,
                           unsigned FrameReg) {
  // If a function is marked as arm_locally_streaming, then the runtime value
  // of vscale in the prologue/epilogue is different from the runtime value of
  // vscale in the function's body. To avoid having to consider multiple
  // vscales, we can use `addsvl` to allocate any scalable stack-slots, which
  // under most circumstances will be only locals, not callee-save slots.
  const Function &F = MBB.getParent()->getFunction();
  bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");

  int64_t Bytes, NumPredicateVectors, NumDataVectors;
  AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
      Offset, Bytes, NumPredicateVectors, NumDataVectors);

  // First emit non-scalable frame offsets, or a simple 'mov'.
  if (Bytes || (!Offset && SrcReg != DestReg)) {
    assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
           "SP increment/decrement not 8-byte aligned");
    unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
    if (Bytes < 0) {
      Bytes = -Bytes;
      Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
    }
    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
                       NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
                       FrameReg);
    CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
                     ? StackOffset::getFixed(-Bytes)
                     : StackOffset::getFixed(Bytes);
    SrcReg = DestReg;
    FrameReg = DestReg;
  }

  assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
         "SetNZCV not supported with SVE vectors");
  assert(!(NeedsWinCFI && NumPredicateVectors) &&
         "WinCFI can't allocate fractions of an SVE data vector");

  if (NumDataVectors) {
    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
                       UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI, TII,
                       Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
                       FrameReg);
    CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
    SrcReg = DestReg;
  }

  if (NumPredicateVectors) {
    assert(DestReg != AArch64::SP && "Unaligned access to SP");
    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
                       UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI, TII,
                       Flag, NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
                       FrameReg);
  }
}

MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
    MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
    MachineBasicBlock::iterator InsertPt, int FrameIndex,
    LiveIntervals *LIS, VirtRegMap *VRM) const {
  // This is a bit of a hack. Consider this instruction:
  //
  //   %0 = COPY %sp; GPR64all:%0
  //
  // We explicitly chose GPR64all for the virtual register so such a copy might
  // be eliminated by RegisterCoalescer. However, that may not be possible, and
  // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
  // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
  //
  // To prevent that, we are going to constrain the %0 register class here.
  if (MI.isFullCopy()) {
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();
    if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
      MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
      return nullptr;
    }
    if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
      MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
      return nullptr;
    }
    // Nothing can be folded with a copy from/to NZCV.
    if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
      return nullptr;
  }

  // Handle the case where a copy is being spilled or filled but the source
  // and destination register classes don't match. For example:
  //
  //   %0 = COPY %xzr; GPR64common:%0
  //
  // In this case we can still safely fold away the COPY and generate the
  // following spill code:
  //
  //   STRXui %xzr, %stack.0
  //
  // This also eliminates spilled cross register class COPYs (e.g. between x
  // and d regs) of the same size. For example:
  //
  //   %0 = COPY %1; GPR64:%0, FPR64:%1
  //
  // will be filled as
  //
  //   LDRDui %0, fi<#0>
  //
  // instead of
  //
  //   LDRXui %Temp, fi<#0>
  //   %0 = FMOV %Temp
  //
  if (MI.isCopy() && Ops.size() == 1 &&
      // Make sure we're only folding the explicit COPY defs/uses.
      (Ops[0] == 0 || Ops[0] == 1)) {
    bool IsSpill = Ops[0] == 0;
    bool IsFill = !IsSpill;
    const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    MachineBasicBlock &MBB = *MI.getParent();
    const MachineOperand &DstMO = MI.getOperand(0);
    const MachineOperand &SrcMO = MI.getOperand(1);
    Register DstReg = DstMO.getReg();
    Register SrcReg = SrcMO.getReg();
    // This is slightly expensive to compute for physical regs since
    // getMinimalPhysRegClass is slow.
    auto getRegClass = [&](unsigned Reg) {
      return Register::isVirtualRegister(Reg)
                 ? MRI.getRegClass(Reg)
                 : TRI.getMinimalPhysRegClass(Reg);
    };

    if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
      assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
                 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
             "Mismatched register size in non subreg COPY");
      if (IsSpill)
        storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
                            getRegClass(SrcReg), &TRI, Register());
      else
        loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
                             getRegClass(DstReg), &TRI, Register());
      return &*--InsertPt;
    }

    // Handle cases like spilling def of:
    //
    //   %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
    //
    // where the physical register source can be widened and stored to the full
    // virtual reg destination stack slot, in this case producing:
    //
    //   STRXui %xzr, %stack.0
    //
    if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
        TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) {
      assert(SrcMO.getSubReg() == 0 &&
             "Unexpected subreg on physical register");
      storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
                          FrameIndex, &AArch64::GPR64RegClass, &TRI,
                          Register());
      return &*--InsertPt;
    }

    // Handle cases like filling use of:
    //
    //   %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
    //
    // where we can load the full virtual reg source stack slot, into the
    // subreg destination, in this case producing:
    //
    //   LDRWui %0:sub_32<def,read-undef>, %stack.0
    //
    if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
      const TargetRegisterClass *FillRC;
      switch (DstMO.getSubReg()) {
      default:
        FillRC = nullptr;
        break;
      case AArch64::sub_32:
        FillRC = &AArch64::GPR32RegClass;
        break;
      case AArch64::ssub:
        FillRC = &AArch64::FPR32RegClass;
        break;
      case AArch64::dsub:
        FillRC = &AArch64::FPR64RegClass;
        break;
      }

      if (FillRC) {
        assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
                   TRI.getRegSizeInBits(*FillRC) &&
               "Mismatched regclass size on folded subreg COPY");
        loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI,
                             Register());
        MachineInstr &LoadMI = *--InsertPt;
        MachineOperand &LoadDst = LoadMI.getOperand(0);
        assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
        LoadDst.setSubReg(DstMO.getSubReg());
        LoadDst.setIsUndef();
        return &LoadMI;
      }
    }
  }

  // Cannot fold.
  return nullptr;
}

int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
                                    StackOffset &SOffset,
                                    bool *OutUseUnscaledOp,
                                    unsigned *OutUnscaledOp,
                                    int64_t *EmittableOffset) {
  // Set output values in case of early exit.
  if (EmittableOffset)
    *EmittableOffset = 0;
  if (OutUseUnscaledOp)
    *OutUseUnscaledOp = false;
  if (OutUnscaledOp)
    *OutUnscaledOp = 0;

  // Exit early for structured vector spills/fills as they can't take an
  // immediate offset.
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::LD1Rv1d:
  case AArch64::LD1Rv2s:
  case AArch64::LD1Rv2d:
  case AArch64::LD1Rv4h:
  case AArch64::LD1Rv4s:
  case AArch64::LD1Rv8b:
  case AArch64::LD1Rv8h:
  case AArch64::LD1Rv16b:
  case AArch64::LD1Twov2d:
  case AArch64::LD1Threev2d:
  case AArch64::LD1Fourv2d:
  case AArch64::LD1Twov1d:
  case AArch64::LD1Threev1d:
  case AArch64::LD1Fourv1d:
  case AArch64::ST1Twov2d:
  case AArch64::ST1Threev2d:
  case AArch64::ST1Fourv2d:
  case AArch64::ST1Twov1d:
  case AArch64::ST1Threev1d:
  case AArch64::ST1Fourv1d:
  case AArch64::ST1i8:
  case AArch64::ST1i16:
  case AArch64::ST1i32:
  case AArch64::ST1i64:
  case AArch64::IRG:
  case AArch64::IRGstack:
  case AArch64::STGloop:
  case AArch64::STZGloop:
    return AArch64FrameOffsetCannotUpdate;
  }

  // Get the min/max offset and the scale.
  TypeSize ScaleValue(0U, false), Width(0U, false);
  int64_t MinOff, MaxOff;
  if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
                                      MaxOff))
    llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");

  // Construct the complete offset.
  bool IsMulVL = ScaleValue.isScalable();
  unsigned Scale = ScaleValue.getKnownMinValue();
  int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();

  const MachineOperand &ImmOpnd =
      MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
  Offset += ImmOpnd.getImm() * Scale;

  // If the offset doesn't match the scale, we rewrite the instruction to
  // use the unscaled instruction instead. Likewise, if we have a negative
  // offset and there is an unscaled op to use.
  std::optional<unsigned> UnscaledOp =
      AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
  bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
6398 if (useUnscaledOp &&
6399 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
6400 MaxOff))
6401 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
6402
6403 Scale = ScaleValue.getKnownMinValue();
6404 assert(IsMulVL == ScaleValue.isScalable() &&
6405 "Unscaled opcode has different value for scalable");
6406
6407 int64_t Remainder = Offset % Scale;
6408 assert(!(Remainder && useUnscaledOp) &&
6409 "Cannot have remainder when using unscaled op");
6410
6411 assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
6412 int64_t NewOffset = Offset / Scale;
6413 if (MinOff <= NewOffset && NewOffset <= MaxOff)
6414 Offset = Remainder;
6415 else {
6416 NewOffset = NewOffset < 0 ? MinOff : MaxOff;
6417 Offset = Offset - (NewOffset * Scale);
6418 }
6419
6420 if (EmittableOffset)
6421 *EmittableOffset = NewOffset;
6422 if (OutUseUnscaledOp)
6423 *OutUseUnscaledOp = useUnscaledOp;
6424 if (OutUnscaledOp && UnscaledOp)
6425 *OutUnscaledOp = *UnscaledOp;
6426
6427 if (IsMulVL)
6428 SOffset = StackOffset::get(SOffset.getFixed(), Offset);
6429 else
6430 SOffset = StackOffset::get(Offset, SOffset.getScalable());
6431 return AArch64FrameOffsetCanUpdate |
6432 (SOffset ? 0 : AArch64FrameOffsetIsLegal);
6433 }
6434
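// Illustrative walk-through of the offset legalization above (an editorial
// sketch, not exhaustive): for a scaled LDRXui, getMemOpInfo reports
// Scale = 8 and an immediate range of [0, 4095]. A byte offset of 12 is not
// a multiple of 8, so the code switches to the unscaled LDURXi (Scale = 1,
// range [-256, 255]) and emits the offset directly. A byte offset of 40000
// with LDRXui gives NewOffset = 40000 / 8 = 5000, which exceeds 4095;
// NewOffset is clamped to 4095 and the residual 40000 - 4095 * 8 = 7240
// bytes is left in SOffset, so the result carries AArch64FrameOffsetCanUpdate
// but not AArch64FrameOffsetIsLegal and the caller must materialize the
// remainder separately.
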
bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
                                    unsigned FrameReg, StackOffset &Offset,
                                    const AArch64InstrInfo *TII) {
  unsigned Opcode = MI.getOpcode();
  unsigned ImmIdx = FrameRegIdx + 1;

  if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
    Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
    emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
                    MI.getOperand(0).getReg(), FrameReg, Offset, TII,
                    MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
    MI.eraseFromParent();
    Offset = StackOffset();
    return true;
  }

  int64_t NewOffset;
  unsigned UnscaledOp;
  bool UseUnscaledOp;
  int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
                                         &UnscaledOp, &NewOffset);
  if (Status & AArch64FrameOffsetCanUpdate) {
    if (Status & AArch64FrameOffsetIsLegal)
      // Replace the FrameIndex with FrameReg.
      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
    if (UseUnscaledOp)
      MI.setDesc(TII->get(UnscaledOp));

    MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
    return !Offset;
  }

  return false;
}

void AArch64InstrInfo::insertNoop(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MI) const {
  DebugLoc DL;
  BuildMI(MBB, MI, DL, get(AArch64::HINT)).addImm(0);
}

MCInst AArch64InstrInfo::getNop() const {
  return MCInstBuilder(AArch64::HINT).addImm(0);
}

// AArch64 supports MachineCombiner.
bool AArch64InstrInfo::useMachineCombiner() const { return true; }

// True when Opc sets flags.
static bool isCombineInstrSettingFlag(unsigned Opc) {
  switch (Opc) {
  case AArch64::ADDSWrr:
  case AArch64::ADDSWri:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXri:
  case AArch64::SUBSWrr:
  case AArch64::SUBSXrr:
  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
  case AArch64::SUBSWri:
  case AArch64::SUBSXri:
    return true;
  default:
    break;
  }
  return false;
}

// 32b Opcodes that can be combined with a MUL
static bool isCombineInstrCandidate32(unsigned Opc) {
  switch (Opc) {
  case AArch64::ADDWrr:
  case AArch64::ADDWri:
  case AArch64::SUBWrr:
  case AArch64::ADDSWrr:
  case AArch64::ADDSWri:
  case AArch64::SUBSWrr:
  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
  case AArch64::SUBWri:
  case AArch64::SUBSWri:
    return true;
  default:
    break;
  }
  return false;
}

// 64b Opcodes that can be combined with a MUL
static bool isCombineInstrCandidate64(unsigned Opc) {
  switch (Opc) {
  case AArch64::ADDXrr:
  case AArch64::ADDXri:
  case AArch64::SUBXrr:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXri:
  case AArch64::SUBSXrr:
  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
  case AArch64::SUBXri:
  case AArch64::SUBSXri:
  case AArch64::ADDv8i8:
  case AArch64::ADDv16i8:
  case AArch64::ADDv4i16:
  case AArch64::ADDv8i16:
  case AArch64::ADDv2i32:
  case AArch64::ADDv4i32:
  case AArch64::SUBv8i8:
  case AArch64::SUBv16i8:
  case AArch64::SUBv4i16:
  case AArch64::SUBv8i16:
  case AArch64::SUBv2i32:
  case AArch64::SUBv4i32:
    return true;
  default:
    break;
  }
  return false;
}

// FP Opcodes that can be combined with a FMUL.
static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
  switch (Inst.getOpcode()) {
  default:
    break;
  case AArch64::FADDHrr:
  case AArch64::FADDSrr:
  case AArch64::FADDDrr:
  case AArch64::FADDv4f16:
  case AArch64::FADDv8f16:
  case AArch64::FADDv2f32:
  case AArch64::FADDv2f64:
  case AArch64::FADDv4f32:
  case AArch64::FSUBHrr:
  case AArch64::FSUBSrr:
  case AArch64::FSUBDrr:
  case AArch64::FSUBv4f16:
  case AArch64::FSUBv8f16:
  case AArch64::FSUBv2f32:
  case AArch64::FSUBv2f64:
  case AArch64::FSUBv4f32:
    TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
    // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
    // the target options or if FADD/FSUB has the contract fast-math flag.
    return Options.UnsafeFPMath ||
           Options.AllowFPOpFusion == FPOpFusion::Fast ||
           Inst.getFlag(MachineInstr::FmContract);
  }
  return false;
}

// Opcodes that can be combined with a MUL
static bool isCombineInstrCandidate(unsigned Opc) {
  return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
}

//
// Utility routine that checks if \param MO is defined by an
// \param CombineOpc instruction in the basic block \param MBB
static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
                       unsigned CombineOpc, unsigned ZeroReg = 0,
                       bool CheckZeroReg = false) {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineInstr *MI = nullptr;

  if (MO.isReg() && MO.getReg().isVirtual())
    MI = MRI.getUniqueVRegDef(MO.getReg());
  // And it needs to be in the trace (otherwise, it won't have a depth).
  if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
    return false;
  // It must only be used by the instruction we combine with.
  if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
    return false;

  if (CheckZeroReg) {
    assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
           MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
           MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
    // The third input reg must be zero.
    if (MI->getOperand(3).getReg() != ZeroReg)
      return false;
  }

  if (isCombineInstrSettingFlag(CombineOpc) &&
      MI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) == -1)
    return false;

  return true;
}

//
// Is \param MO defined by an integer multiply and can be combined?
static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
                              unsigned MulOpc, unsigned ZeroReg) {
  return canCombine(MBB, MO, MulOpc, ZeroReg, true);
}

//
// Is \param MO defined by a floating-point multiply and can be combined?
static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
                               unsigned MulOpc) {
  return canCombine(MBB, MO, MulOpc);
}

// TODO: There are many more machine instruction opcodes to match:
//       1. Other data types (integer, vectors)
//       2. Other math / logic operations (xor, or)
//       3. Other forms of the same operation (intrinsics and other variants)
bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
                                                   bool Invert) const {
  if (Invert)
    return false;
  switch (Inst.getOpcode()) {
  // == Floating-point types ==
  // -- Floating-point instructions --
  case AArch64::FADDHrr:
  case AArch64::FADDSrr:
  case AArch64::FADDDrr:
  case AArch64::FMULHrr:
  case AArch64::FMULSrr:
  case AArch64::FMULDrr:
  case AArch64::FMULX16:
  case AArch64::FMULX32:
  case AArch64::FMULX64:
  // -- Advanced SIMD instructions --
  case AArch64::FADDv4f16:
  case AArch64::FADDv8f16:
  case AArch64::FADDv2f32:
  case AArch64::FADDv4f32:
  case AArch64::FADDv2f64:
  case AArch64::FMULv4f16:
  case AArch64::FMULv8f16:
  case AArch64::FMULv2f32:
  case AArch64::FMULv4f32:
  case AArch64::FMULv2f64:
  case AArch64::FMULXv4f16:
  case AArch64::FMULXv8f16:
  case AArch64::FMULXv2f32:
  case AArch64::FMULXv4f32:
  case AArch64::FMULXv2f64:
  // -- SVE instructions --
  // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
  // in the SVE instruction set (though there are predicated ones).
  case AArch64::FADD_ZZZ_H:
  case AArch64::FADD_ZZZ_S:
  case AArch64::FADD_ZZZ_D:
  case AArch64::FMUL_ZZZ_H:
  case AArch64::FMUL_ZZZ_S:
  case AArch64::FMUL_ZZZ_D:
    return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath ||
           (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
            Inst.getFlag(MachineInstr::MIFlag::FmNsz));

  // == Integer types ==
  // -- Base instructions --
  // Opcodes MULWrr and MULXrr don't exist because
  // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
  // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
  // The machine-combiner does not support three-source-operand machine
  // instructions, so we cannot reassociate MULs.
  case AArch64::ADDWrr:
  case AArch64::ADDXrr:
  case AArch64::ANDWrr:
  case AArch64::ANDXrr:
  case AArch64::ORRWrr:
  case AArch64::ORRXrr:
  case AArch64::EORWrr:
  case AArch64::EORXrr:
  case AArch64::EONWrr:
  case AArch64::EONXrr:
  // -- Advanced SIMD instructions --
  // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
  // in the Advanced SIMD instruction set.
  case AArch64::ADDv8i8:
  case AArch64::ADDv16i8:
  case AArch64::ADDv4i16:
  case AArch64::ADDv8i16:
  case AArch64::ADDv2i32:
  case AArch64::ADDv4i32:
  case AArch64::ADDv1i64:
  case AArch64::ADDv2i64:
  case AArch64::MULv8i8:
  case AArch64::MULv16i8:
  case AArch64::MULv4i16:
  case AArch64::MULv8i16:
  case AArch64::MULv2i32:
  case AArch64::MULv4i32:
  case AArch64::ANDv8i8:
  case AArch64::ANDv16i8:
  case AArch64::ORRv8i8:
  case AArch64::ORRv16i8:
  case AArch64::EORv8i8:
  case AArch64::EORv16i8:
  // -- SVE instructions --
  case AArch64::ADD_ZZZ_B:
  case AArch64::ADD_ZZZ_H:
  case AArch64::ADD_ZZZ_S:
  case AArch64::ADD_ZZZ_D:
  case AArch64::MUL_ZZZ_B:
  case AArch64::MUL_ZZZ_H:
  case AArch64::MUL_ZZZ_S:
  case AArch64::MUL_ZZZ_D:
  case AArch64::AND_ZZZ:
  case AArch64::ORR_ZZZ:
  case AArch64::EOR_ZZZ:
    return true;

  default:
    return false;
  }
}

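// For instance (editorial sketch, not upstream text): reporting ADDXrr as
// associative and commutative lets the MachineCombiner rebalance a serial
// chain such as
//   t1 = ADDXrr a, b;  t2 = ADDXrr t1, c;  t3 = ADDXrr t2, d
// into the shallower
//   t1 = ADDXrr a, b;  u  = ADDXrr c, d;   t3 = ADDXrr t1, u
// shortening the critical path from three dependent adds to two. The FP
// opcodes are only reported when reassociation is permitted (unsafe-fp-math
// or the reassoc+nsz flags), since FP addition is not associative in general.
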
/// Find instructions that can be turned into madd.
static bool getMaddPatterns(MachineInstr &Root,
                            SmallVectorImpl<unsigned> &Patterns) {
  unsigned Opc = Root.getOpcode();
  MachineBasicBlock &MBB = *Root.getParent();
  bool Found = false;

  if (!isCombineInstrCandidate(Opc))
    return false;
  if (isCombineInstrSettingFlag(Opc)) {
    int Cmp_NZCV =
        Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
    // When NZCV is live, bail out.
    if (Cmp_NZCV == -1)
      return false;
    unsigned NewOpc = convertToNonFlagSettingOpc(Root);
    // When the opcode can't change, bail out.
    // CHECKME: do we miss any cases for opcode conversion?
    if (NewOpc == Opc)
      return false;
    Opc = NewOpc;
  }

  auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
                      unsigned Pattern) {
    if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
      Patterns.push_back(Pattern);
      Found = true;
    }
  };

  auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
    if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
      Patterns.push_back(Pattern);
      Found = true;
    }
  };

  typedef AArch64MachineCombinerPattern MCP;

  switch (Opc) {
  default:
    break;
  case AArch64::ADDWrr:
    assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
           "ADDWrr does not have register operands");
    setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
    setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
    break;
  case AArch64::ADDXrr:
    setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
    setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
    break;
  case AArch64::SUBWrr:
    setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
    setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
    break;
  case AArch64::SUBXrr:
    setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
    setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
    break;
  case AArch64::ADDWri:
    setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
    break;
  case AArch64::ADDXri:
    setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
    break;
  case AArch64::SUBWri:
    setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
    break;
  case AArch64::SUBXri:
    setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
    break;
  case AArch64::ADDv8i8:
    setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
    setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
    break;
  case AArch64::ADDv16i8:
    setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
    setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
    break;
  case AArch64::ADDv4i16:
    setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
    setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
    setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
    setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
    break;
  case AArch64::ADDv8i16:
    setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
    setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
    setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
    setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
    break;
  case AArch64::ADDv2i32:
    setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
    setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
    setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
    setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
    break;
  case AArch64::ADDv4i32:
    setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
    setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
    setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
    setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
    break;
  case AArch64::SUBv8i8:
    setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
    setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
    break;
  case AArch64::SUBv16i8:
    setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
    setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
    break;
  case AArch64::SUBv4i16:
    setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
    setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
    setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
    setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
    break;
  case AArch64::SUBv8i16:
    setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
    setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
    setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
    setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
    break;
  case AArch64::SUBv2i32:
    setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
    setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
    setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
    setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
    break;
  case AArch64::SUBv4i32:
    setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
    setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
    setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
    setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
    break;
  }
  return Found;
}

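// Example of the pattern matched above (editorial sketch): since MUL is an
// alias of MADD with a zero addend, a multiply feeding an add, e.g.
//   %3 = MADDWrrr %0, %1, $wzr    ; %3 = %0 * %1
//   %4 = ADDWrr   %3, %2
// is recorded as MULADDW_OP1 and can later be rewritten by the combiner as
//   %4 = MADDWrrr %0, %1, %2
// provided the multiply result has no other non-debug use.
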
bool AArch64InstrInfo::isAccumulationOpcode(unsigned Opcode) const {
  switch (Opcode) {
  default:
    break;
  case AArch64::UABALB_ZZZ_D:
  case AArch64::UABALB_ZZZ_H:
  case AArch64::UABALB_ZZZ_S:
  case AArch64::UABALT_ZZZ_D:
  case AArch64::UABALT_ZZZ_H:
  case AArch64::UABALT_ZZZ_S:
  case AArch64::SABALB_ZZZ_D:
  case AArch64::SABALB_ZZZ_S:
  case AArch64::SABALB_ZZZ_H:
  case AArch64::SABALT_ZZZ_D:
  case AArch64::SABALT_ZZZ_S:
  case AArch64::SABALT_ZZZ_H:
  case AArch64::UABALv16i8_v8i16:
  case AArch64::UABALv2i32_v2i64:
  case AArch64::UABALv4i16_v4i32:
  case AArch64::UABALv4i32_v2i64:
  case AArch64::UABALv8i16_v4i32:
  case AArch64::UABALv8i8_v8i16:
  case AArch64::UABAv16i8:
  case AArch64::UABAv2i32:
  case AArch64::UABAv4i16:
  case AArch64::UABAv4i32:
  case AArch64::UABAv8i16:
  case AArch64::UABAv8i8:
  case AArch64::SABALv16i8_v8i16:
  case AArch64::SABALv2i32_v2i64:
  case AArch64::SABALv4i16_v4i32:
  case AArch64::SABALv4i32_v2i64:
  case AArch64::SABALv8i16_v4i32:
  case AArch64::SABALv8i8_v8i16:
  case AArch64::SABAv16i8:
  case AArch64::SABAv2i32:
  case AArch64::SABAv4i16:
  case AArch64::SABAv4i32:
  case AArch64::SABAv8i16:
  case AArch64::SABAv8i8:
    return true;
  }

  return false;
}

unsigned AArch64InstrInfo::getAccumulationStartOpcode(
    unsigned AccumulationOpcode) const {
  switch (AccumulationOpcode) {
  default:
    llvm_unreachable("Unsupported accumulation Opcode!");
  case AArch64::UABALB_ZZZ_D:
    return AArch64::UABDLB_ZZZ_D;
  case AArch64::UABALB_ZZZ_H:
    return AArch64::UABDLB_ZZZ_H;
  case AArch64::UABALB_ZZZ_S:
    return AArch64::UABDLB_ZZZ_S;
  case AArch64::UABALT_ZZZ_D:
    return AArch64::UABDLT_ZZZ_D;
  case AArch64::UABALT_ZZZ_H:
    return AArch64::UABDLT_ZZZ_H;
  case AArch64::UABALT_ZZZ_S:
    return AArch64::UABDLT_ZZZ_S;
  case AArch64::UABALv16i8_v8i16:
    return AArch64::UABDLv16i8_v8i16;
  case AArch64::UABALv2i32_v2i64:
    return AArch64::UABDLv2i32_v2i64;
  case AArch64::UABALv4i16_v4i32:
    return AArch64::UABDLv4i16_v4i32;
  case AArch64::UABALv4i32_v2i64:
    return AArch64::UABDLv4i32_v2i64;
  case AArch64::UABALv8i16_v4i32:
    return AArch64::UABDLv8i16_v4i32;
  case AArch64::UABALv8i8_v8i16:
    return AArch64::UABDLv8i8_v8i16;
  case AArch64::UABAv16i8:
    return AArch64::UABDv16i8;
  case AArch64::UABAv2i32:
    return AArch64::UABDv2i32;
  case AArch64::UABAv4i16:
    return AArch64::UABDv4i16;
  case AArch64::UABAv4i32:
    return AArch64::UABDv4i32;
  case AArch64::UABAv8i16:
    return AArch64::UABDv8i16;
  case AArch64::UABAv8i8:
    return AArch64::UABDv8i8;
  case AArch64::SABALB_ZZZ_D:
    return AArch64::SABDLB_ZZZ_D;
  case AArch64::SABALB_ZZZ_S:
    return AArch64::SABDLB_ZZZ_S;
  case AArch64::SABALB_ZZZ_H:
    return AArch64::SABDLB_ZZZ_H;
  case AArch64::SABALT_ZZZ_D:
    return AArch64::SABDLT_ZZZ_D;
  case AArch64::SABALT_ZZZ_S:
    return AArch64::SABDLT_ZZZ_S;
  case AArch64::SABALT_ZZZ_H:
    return AArch64::SABDLT_ZZZ_H;
  case AArch64::SABALv16i8_v8i16:
    return AArch64::SABDLv16i8_v8i16;
  case AArch64::SABALv2i32_v2i64:
    return AArch64::SABDLv2i32_v2i64;
  case AArch64::SABALv4i16_v4i32:
    return AArch64::SABDLv4i16_v4i32;
  case AArch64::SABALv4i32_v2i64:
    return AArch64::SABDLv4i32_v2i64;
  case AArch64::SABALv8i16_v4i32:
    return AArch64::SABDLv8i16_v4i32;
  case AArch64::SABALv8i8_v8i16:
    return AArch64::SABDLv8i8_v8i16;
  case AArch64::SABAv16i8:
    return AArch64::SABDv16i8;
  case AArch64::SABAv2i32:
    return AArch64::SABDv2i32;
  case AArch64::SABAv4i16:
    return AArch64::SABDv4i16;
  case AArch64::SABAv4i32:
    return AArch64::SABDv4i32;
  case AArch64::SABAv8i16:
    return AArch64::SABDv8i16;
  case AArch64::SABAv8i8:
    return AArch64::SABDv8i8;
  }
}

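// Editorial note: the mapping above rewrites the head of an accumulation
// chain into its non-accumulating form so the chain can be split into
// parallel partial chains. For example, UABAv8i8 (unsigned absolute
// difference and accumulate) starts as UABDv8i8 (plain absolute difference),
// and the widening UABALv8i8_v8i16 starts as UABDLv8i8_v8i16; the partial
// sums are recombined later with the ADD opcode chosen by
// getReduceOpcodeForAccumulator() below.
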
/// Floating-Point Support

/// Find instructions that can be turned into fmadd.
static bool getFMAPatterns(MachineInstr &Root,
                           SmallVectorImpl<unsigned> &Patterns) {

  if (!isCombineInstrCandidateFP(Root))
    return false;

  MachineBasicBlock &MBB = *Root.getParent();
  bool Found = false;

  auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
    if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
      Patterns.push_back(Pattern);
      return true;
    }
    return false;
  };

  typedef AArch64MachineCombinerPattern MCP;

  switch (Root.getOpcode()) {
  default:
    assert(false && "Unsupported FP instruction in combiner\n");
    break;
  case AArch64::FADDHrr:
    assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
           "FADDHrr does not have register operands");

    Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
    Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
    break;
  case AArch64::FADDSrr:
    assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
           "FADDSrr does not have register operands");

    Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
             Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);

    Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
             Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
    break;
  case AArch64::FADDDrr:
    Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
             Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);

    Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
             Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
    break;
  case AArch64::FADDv4f16:
    Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
             Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);

    Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
             Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
    break;
  case AArch64::FADDv8f16:
    Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
             Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);

    Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
             Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
    break;
  case AArch64::FADDv2f32:
    Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
             Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);

    Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
             Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
    break;
  case AArch64::FADDv2f64:
    Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
             Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);

    Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
             Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
    break;
  case AArch64::FADDv4f32:
    Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
             Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);

    Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
             Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
    break;
  case AArch64::FSUBHrr:
    Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
    Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
    Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
    break;
  case AArch64::FSUBSrr:
    Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);

    Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
             Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);

    Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
    break;
  case AArch64::FSUBDrr:
    Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);

    Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
             Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);

    Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
    break;
  case AArch64::FSUBv4f16:
    Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
             Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);

    Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
             Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
    break;
  case AArch64::FSUBv8f16:
    Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
             Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);

    Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
             Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
    break;
  case AArch64::FSUBv2f32:
    Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
             Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);

    Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
             Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
    break;
  case AArch64::FSUBv2f64:
    Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
             Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);

    Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
             Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
    break;
  case AArch64::FSUBv4f32:
    Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
             Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);

    Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
             Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
    break;
  }
  return Found;
}

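// Example of an FMA pattern matched above (editorial sketch), subject to the
// fast-math checks in isCombineInstrCandidateFP():
//   %3 = FMULSrr %0, %1
//   %4 = FADDSrr %3, %2      ; recorded as FMULADDS_OP1
// can later be fused into a single
//   %4 = FMADDSrrr %0, %1, %2
// removing one instruction and one rounding step from the dependence chain.
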
static bool getFMULPatterns(MachineInstr &Root,
                            SmallVectorImpl<unsigned> &Patterns) {
  MachineBasicBlock &MBB = *Root.getParent();
  bool Found = false;

  auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
    MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
    MachineOperand &MO = Root.getOperand(Operand);
    MachineInstr *MI = nullptr;
    if (MO.isReg() && MO.getReg().isVirtual())
      MI = MRI.getUniqueVRegDef(MO.getReg());
    // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
    if (MI && MI->getOpcode() == TargetOpcode::COPY &&
        MI->getOperand(1).getReg().isVirtual())
      MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());
    if (MI && MI->getOpcode() == Opcode) {
      Patterns.push_back(Pattern);
      return true;
    }
    return false;
  };

  typedef AArch64MachineCombinerPattern MCP;

  switch (Root.getOpcode()) {
  default:
    return false;
  case AArch64::FMULv2f32:
    Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
    Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
    break;
  case AArch64::FMULv2f64:
    Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
    Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
    break;
  case AArch64::FMULv4f16:
    Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
    Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
    break;
  case AArch64::FMULv4f32:
    Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
    Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
    break;
  case AArch64::FMULv8f16:
    Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
    Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
    break;
  }

  return Found;
}

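// Example for the DUP folding above (editorial sketch): a lane splat feeding
// a vector multiply, e.g.
//   %2 = DUPv4i32lane %1, 1
//   %3 = FMULv4f32 %0, %2    ; recorded as FMULv4i32_indexed_OP2
// can be rewritten by genIndexedMultiply() below as the by-element form
//   %3 = FMULv4i32_indexed %0, %1, 1
// which drops the explicit DUP.
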
static bool getFNEGPatterns(MachineInstr &Root,
                            SmallVectorImpl<unsigned> &Patterns) {
  unsigned Opc = Root.getOpcode();
  MachineBasicBlock &MBB = *Root.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
    MachineOperand &MO = Root.getOperand(1);
    MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg());
    if (MI != nullptr && (MI->getOpcode() == Opcode) &&
        MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&
        Root.getFlag(MachineInstr::MIFlag::FmContract) &&
        Root.getFlag(MachineInstr::MIFlag::FmNsz) &&
        MI->getFlag(MachineInstr::MIFlag::FmContract) &&
        MI->getFlag(MachineInstr::MIFlag::FmNsz)) {
      Patterns.push_back(Pattern);
      return true;
    }
    return false;
  };

  switch (Opc) {
  default:
    break;
  case AArch64::FNEGDr:
    return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
  case AArch64::FNEGSr:
    return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
  }

  return false;
}

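// Example for the FNEG pattern above (editorial sketch): with the contract
// and nsz flags set on both instructions,
//   %2 = FMADDSrrr %a, %b, %c
//   %3 = FNEGSr %2
// is matched as FNMADD and later emitted by genFNegatedMAD() as
//   %3 = FNMADDSrrr %a, %b, %c
// i.e. the negation is folded into the fused multiply-add.
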
/// Return true when a code sequence can improve throughput. It
/// should be called only for instructions in loops.
/// \param Pattern - combiner pattern
bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
  switch (Pattern) {
  default:
    break;
  case AArch64MachineCombinerPattern::FMULADDH_OP1:
  case AArch64MachineCombinerPattern::FMULADDH_OP2:
  case AArch64MachineCombinerPattern::FMULSUBH_OP1:
  case AArch64MachineCombinerPattern::FMULSUBH_OP2:
  case AArch64MachineCombinerPattern::FMULADDS_OP1:
  case AArch64MachineCombinerPattern::FMULADDS_OP2:
  case AArch64MachineCombinerPattern::FMULSUBS_OP1:
  case AArch64MachineCombinerPattern::FMULSUBS_OP2:
  case AArch64MachineCombinerPattern::FMULADDD_OP1:
  case AArch64MachineCombinerPattern::FMULADDD_OP2:
  case AArch64MachineCombinerPattern::FMULSUBD_OP1:
  case AArch64MachineCombinerPattern::FMULSUBD_OP2:
  case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
  case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
  case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
  case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
  case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
  case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
  case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
  case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
  case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
  case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
  case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
  case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
  case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
  case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
  case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
  case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
  case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
  case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
  case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
  case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
  case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2:
  case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2:
  case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2:
  case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
  case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
  case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
  case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
  case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
  case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
  case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
  case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
  case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
  case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
  case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
  case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
  case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
  case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
  case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
  case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
  case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
  case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
  case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
  case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
  case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
  case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
  case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
  case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
  case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
  case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
  case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
  case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
  case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
  case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
  case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
  case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
  case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
  case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
  case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
  case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
  case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
  case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
  case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
  case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
    return true;
  } // end switch (Pattern)
  return false;
}

/// Find other MI combine patterns.
static bool getMiscPatterns(MachineInstr &Root,
                            SmallVectorImpl<unsigned> &Patterns) {
  // A - (B + C)  ==>  (A - B) - C  or  (A - C) - B
  unsigned Opc = Root.getOpcode();
  MachineBasicBlock &MBB = *Root.getParent();

  switch (Opc) {
  case AArch64::SUBWrr:
  case AArch64::SUBSWrr:
  case AArch64::SUBXrr:
  case AArch64::SUBSXrr:
    // Found candidate root.
    break;
  default:
    return false;
  }

  if (isCombineInstrSettingFlag(Opc) &&
      Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) ==
          -1)
    return false;

  if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
      canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
      canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
      canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
    Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP1);
    Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP2);
    return true;
  }

  return false;
}

CombinerObjective
AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
  switch (Pattern) {
  case AArch64MachineCombinerPattern::SUBADD_OP1:
  case AArch64MachineCombinerPattern::SUBADD_OP2:
    return CombinerObjective::MustReduceDepth;
  default:
    return TargetInstrInfo::getCombinerObjective(Pattern);
  }
}

/// Return true when there is potentially a faster code sequence for an
/// instruction chain ending in \p Root. All potential patterns are listed in
/// the \p Patterns vector. Patterns should be sorted in priority order since
/// the pattern evaluator stops checking as soon as it finds a faster sequence.
bool AArch64InstrInfo::getMachineCombinerPatterns(
    MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
    bool DoRegPressureReduce) const {
  // Integer patterns
  if (getMaddPatterns(Root, Patterns))
    return true;
  // Floating point patterns
  if (getFMULPatterns(Root, Patterns))
    return true;
  if (getFMAPatterns(Root, Patterns))
    return true;
  if (getFNEGPatterns(Root, Patterns))
    return true;

  // Other patterns
  if (getMiscPatterns(Root, Patterns))
    return true;

  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
                                                     DoRegPressureReduce);
}

enum class FMAInstKind { Default, Indexed, Accumulator };

/// genFusedMultiply - Generate fused multiply instructions.
/// This function supports both integer and floating point instructions.
/// A typical example:
///   F|MUL I=A,B,0
///   F|ADD R,I,C
///   ==> F|MADD R,A,B,C
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the F|ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the F|MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the f|madd instruction
/// \param RC Register class of operands
/// \param kind Kind of fma instruction (addressing mode) to be generated
/// \param ReplacedAddend is the result register from the instruction
/// replacing the non-combined operand, if any.
static MachineInstr *
genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
                 const TargetInstrInfo *TII, MachineInstr &Root,
                 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
                 unsigned MaddOpc, const TargetRegisterClass *RC,
                 FMAInstKind kind = FMAInstKind::Default,
                 const Register *ReplacedAddend = nullptr) {
  assert(IdxMulOpd == 1 || IdxMulOpd == 2);

  unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
  Register ResultReg = Root.getOperand(0).getReg();
  Register SrcReg0 = MUL->getOperand(1).getReg();
  bool Src0IsKill = MUL->getOperand(1).isKill();
  Register SrcReg1 = MUL->getOperand(2).getReg();
  bool Src1IsKill = MUL->getOperand(2).isKill();

  Register SrcReg2;
  bool Src2IsKill;
  if (ReplacedAddend) {
    // If we just generated a new addend, we must be its only use.
    SrcReg2 = *ReplacedAddend;
    Src2IsKill = true;
  } else {
    SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
    Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
  }

  if (ResultReg.isVirtual())
    MRI.constrainRegClass(ResultReg, RC);
  if (SrcReg0.isVirtual())
    MRI.constrainRegClass(SrcReg0, RC);
  if (SrcReg1.isVirtual())
    MRI.constrainRegClass(SrcReg1, RC);
  if (SrcReg2.isVirtual())
    MRI.constrainRegClass(SrcReg2, RC);

  MachineInstrBuilder MIB;
  if (kind == FMAInstKind::Default)
    MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill))
              .addReg(SrcReg2, getKillRegState(Src2IsKill));
  else if (kind == FMAInstKind::Indexed)
    MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg2, getKillRegState(Src2IsKill))
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill))
              .addImm(MUL->getOperand(3).getImm());
  else if (kind == FMAInstKind::Accumulator)
    MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg2, getKillRegState(Src2IsKill))
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill));
  else
    assert(false && "Invalid FMA instruction kind");
  // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS).
  InsInstrs.push_back(MIB);
  return MUL;
}

static MachineInstr *
genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI,
               const TargetInstrInfo *TII, MachineInstr &Root,
               SmallVectorImpl<MachineInstr *> &InsInstrs) {
  MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());

  unsigned Opc = 0;
  const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());
  if (AArch64::FPR32RegClass.hasSubClassEq(RC))
    Opc = AArch64::FNMADDSrrr;
  else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
    Opc = AArch64::FNMADDDrrr;
  else
    return nullptr;

  Register ResultReg = Root.getOperand(0).getReg();
  Register SrcReg0 = MAD->getOperand(1).getReg();
  Register SrcReg1 = MAD->getOperand(2).getReg();
  Register SrcReg2 = MAD->getOperand(3).getReg();
  bool Src0IsKill = MAD->getOperand(1).isKill();
  bool Src1IsKill = MAD->getOperand(2).isKill();
  bool Src2IsKill = MAD->getOperand(3).isKill();
  if (ResultReg.isVirtual())
    MRI.constrainRegClass(ResultReg, RC);
  if (SrcReg0.isVirtual())
    MRI.constrainRegClass(SrcReg0, RC);
  if (SrcReg1.isVirtual())
    MRI.constrainRegClass(SrcReg1, RC);
  if (SrcReg2.isVirtual())
    MRI.constrainRegClass(SrcReg2, RC);

  MachineInstrBuilder MIB =
      BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)
          .addReg(SrcReg0, getKillRegState(Src0IsKill))
          .addReg(SrcReg1, getKillRegState(Src1IsKill))
          .addReg(SrcReg2, getKillRegState(Src2IsKill));
  InsInstrs.push_back(MIB);

  return MAD;
}

/// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
static MachineInstr *
genIndexedMultiply(MachineInstr &Root,
                   SmallVectorImpl<MachineInstr *> &InsInstrs,
                   unsigned IdxDupOp, unsigned MulOpc,
                   const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
  assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
         "Invalid index of FMUL operand");

  MachineFunction &MF = *Root.getMF();
  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();

  MachineInstr *Dup =
      MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());

  if (Dup->getOpcode() == TargetOpcode::COPY)
    Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg());

  Register DupSrcReg = Dup->getOperand(1).getReg();
  MRI.clearKillFlags(DupSrcReg);
  MRI.constrainRegClass(DupSrcReg, RC);

  unsigned DupSrcLane = Dup->getOperand(2).getImm();

  unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
  MachineOperand &MulOp = Root.getOperand(IdxMulOp);

  Register ResultReg = Root.getOperand(0).getReg();

  MachineInstrBuilder MIB;
  MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg)
            .add(MulOp)
            .addReg(DupSrcReg)
            .addImm(DupSrcLane);

  InsInstrs.push_back(MIB);
  return &Root;
}

/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
/// instructions.
///
/// \see genFusedMultiply
static MachineInstr *genFusedMultiplyAcc(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Accumulator);
}

/// genNeg - Helper to generate an intermediate negation of the second operand
/// of Root
static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
                       const TargetInstrInfo *TII, MachineInstr &Root,
                       SmallVectorImpl<MachineInstr *> &InsInstrs,
                       DenseMap<Register, unsigned> &InstrIdxForVirtReg,
                       unsigned MnegOpc, const TargetRegisterClass *RC) {
  Register NewVR = MRI.createVirtualRegister(RC);
  MachineInstrBuilder MIB =
      BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR)
          .add(Root.getOperand(2));
  InsInstrs.push_back(MIB);

  assert(InstrIdxForVirtReg.empty());
  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));

  return NewVR;
}

/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
/// instructions with an additional negation of the accumulator
static MachineInstr *genFusedMultiplyAccNeg(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
    unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
  assert(IdxMulOpd == 1);

  Register NewVR =
      genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Accumulator, &NewVR);
}

/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
/// instructions.
///
/// \see genFusedMultiply
static MachineInstr *genFusedMultiplyIdx(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Indexed);
}

/// genFusedMultiplyIdxNeg - Helper to generate fused multiply accumulate
/// instructions (indexed form) with an additional negation of the accumulator
static MachineInstr *genFusedMultiplyIdxNeg(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
    unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
  assert(IdxMulOpd == 1);

  Register NewVR =
      genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);

  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Indexed, &NewVR);
}

7666
7667 /// genMaddR - Generate madd instruction and combine mul and add using
7668 /// an extra virtual register
7669 /// Example - an ADD intermediate needs to be stored in a register:
7670 /// MUL I=A,B,0
7671 /// ADD R,I,Imm
7672 /// ==> ORR V, ZR, Imm
7673 /// ==> MADD R,A,B,V
7674 /// \param MF Containing MachineFunction
7675 /// \param MRI Register information
7676 /// \param TII Target information
7677 /// \param Root is the ADD instruction
7678 /// \param [out] InsInstrs is a vector of machine instructions and will
7679 /// contain the generated madd instruction
7680 /// \param IdxMulOpd is index of operand in Root that is the result of
7681 /// the MUL. In the example above IdxMulOpd is 1.
7682 /// \param MaddOpc the opcode fo the madd instruction
7683 /// \param VR is a virtual register that holds the value of an ADD operand
7684 /// (V in the example above).
7685 /// \param RC Register class of operands
genMaddR(MachineFunction & MF,MachineRegisterInfo & MRI,const TargetInstrInfo * TII,MachineInstr & Root,SmallVectorImpl<MachineInstr * > & InsInstrs,unsigned IdxMulOpd,unsigned MaddOpc,unsigned VR,const TargetRegisterClass * RC)7686 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
7687 const TargetInstrInfo *TII, MachineInstr &Root,
7688 SmallVectorImpl<MachineInstr *> &InsInstrs,
7689 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
7690 const TargetRegisterClass *RC) {
7691 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
7692
7693 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
7694 Register ResultReg = Root.getOperand(0).getReg();
7695 Register SrcReg0 = MUL->getOperand(1).getReg();
7696 bool Src0IsKill = MUL->getOperand(1).isKill();
7697 Register SrcReg1 = MUL->getOperand(2).getReg();
7698 bool Src1IsKill = MUL->getOperand(2).isKill();
7699
7700 if (ResultReg.isVirtual())
7701 MRI.constrainRegClass(ResultReg, RC);
7702 if (SrcReg0.isVirtual())
7703 MRI.constrainRegClass(SrcReg0, RC);
7704 if (SrcReg1.isVirtual())
7705 MRI.constrainRegClass(SrcReg1, RC);
7706 if (Register::isVirtualRegister(VR))
7707 MRI.constrainRegClass(VR, RC);
7708
7709 MachineInstrBuilder MIB =
7710 BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
7711 .addReg(SrcReg0, getKillRegState(Src0IsKill))
7712 .addReg(SrcReg1, getKillRegState(Src1IsKill))
7713 .addReg(VR);
7714 // Insert the MADD
7715 InsInstrs.push_back(MIB);
7716 return MUL;
7717 }
7718
7719 /// Do the following transformation
7720 /// A - (B + C) ==> (A - B) - C
7721 /// A - (B + C) ==> (A - C) - B
static void genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
                             const TargetInstrInfo *TII, MachineInstr &Root,
                             SmallVectorImpl<MachineInstr *> &InsInstrs,
                             SmallVectorImpl<MachineInstr *> &DelInstrs,
                             unsigned IdxOpd1,
                             DenseMap<Register, unsigned> &InstrIdxForVirtReg) {
  assert(IdxOpd1 == 1 || IdxOpd1 == 2);
  unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
  MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());

  Register ResultReg = Root.getOperand(0).getReg();
  Register RegA = Root.getOperand(1).getReg();
  bool RegAIsKill = Root.getOperand(1).isKill();
  Register RegB = AddMI->getOperand(IdxOpd1).getReg();
  bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
  Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
  bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
  Register NewVR =
      MRI.createVirtualRegister(MRI.getRegClass(Root.getOperand(2).getReg()));

  unsigned Opcode = Root.getOpcode();
  if (Opcode == AArch64::SUBSWrr)
    Opcode = AArch64::SUBWrr;
  else if (Opcode == AArch64::SUBSXrr)
    Opcode = AArch64::SUBXrr;
  else
    assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
           "Unexpected instruction opcode.");

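  // Note: reassociating the subtraction changes where intermediate results can
  // wrap, so the no-wrap flags are dropped from the merged flags below.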
  uint32_t Flags = Root.mergeFlagsWith(*AddMI);
  Flags &= ~MachineInstr::NoSWrap;
  Flags &= ~MachineInstr::NoUWrap;

  MachineInstrBuilder MIB1 =
      BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)
          .addReg(RegA, getKillRegState(RegAIsKill))
          .addReg(RegB, getKillRegState(RegBIsKill))
          .setMIFlags(Flags);
  MachineInstrBuilder MIB2 =
      BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)
          .addReg(NewVR, getKillRegState(true))
          .addReg(RegC, getKillRegState(RegCIsKill))
          .setMIFlags(Flags);

  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
  InsInstrs.push_back(MIB1);
  InsInstrs.push_back(MIB2);
  DelInstrs.push_back(AddMI);
  DelInstrs.push_back(&Root);
}

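/// Map an accumulating absolute-difference opcode (SABA/UABA, SABAL/UABAL and
/// the SVE ABALB/ABALT forms) to the plain vector ADD with the matching
/// element type, used when the machine combiner splits an accumulation chain
/// into parallel partial sums.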
unsigned AArch64InstrInfo::getReduceOpcodeForAccumulator(
    unsigned int AccumulatorOpCode) const {
  switch (AccumulatorOpCode) {
  case AArch64::UABALB_ZZZ_D:
  case AArch64::SABALB_ZZZ_D:
  case AArch64::UABALT_ZZZ_D:
  case AArch64::SABALT_ZZZ_D:
    return AArch64::ADD_ZZZ_D;
  case AArch64::UABALB_ZZZ_H:
  case AArch64::SABALB_ZZZ_H:
  case AArch64::UABALT_ZZZ_H:
  case AArch64::SABALT_ZZZ_H:
    return AArch64::ADD_ZZZ_H;
  case AArch64::UABALB_ZZZ_S:
  case AArch64::SABALB_ZZZ_S:
  case AArch64::UABALT_ZZZ_S:
  case AArch64::SABALT_ZZZ_S:
    return AArch64::ADD_ZZZ_S;
  case AArch64::UABALv8i8_v8i16:
  case AArch64::SABALv8i8_v8i16:
  case AArch64::UABALv16i8_v8i16:
  case AArch64::SABALv16i8_v8i16:
  case AArch64::SABAv8i16:
  case AArch64::UABAv8i16:
    return AArch64::ADDv8i16;
  case AArch64::UABALv2i32_v2i64:
  case AArch64::SABALv2i32_v2i64:
  case AArch64::UABALv4i32_v2i64:
  case AArch64::SABALv4i32_v2i64:
    return AArch64::ADDv2i64;
  case AArch64::UABALv4i16_v4i32:
  case AArch64::SABALv4i16_v4i32:
  case AArch64::UABALv8i16_v4i32:
  case AArch64::SABALv8i16_v4i32:
  case AArch64::SABAv4i32:
  case AArch64::UABAv4i32:
    return AArch64::ADDv4i32;
  case AArch64::UABAv16i8:
  case AArch64::SABAv16i8:
    return AArch64::ADDv16i8;
  case AArch64::UABAv4i16:
  case AArch64::SABAv4i16:
    return AArch64::ADDv4i16;
  case AArch64::UABAv2i32:
  case AArch64::SABAv2i32:
    return AArch64::ADDv2i32;
  case AArch64::UABAv8i8:
  case AArch64::SABAv8i8:
    return AArch64::ADDv8i8;
  default:
    llvm_unreachable("Unknown accumulator opcode");
  }
}

/// When getMachineCombinerPatterns() finds potential patterns,
/// this function generates the instructions that could replace the
/// original code sequence.
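/// The new instructions are collected in \p InsInstrs and the replaced ones in
/// \p DelInstrs; the caller (the generic machine combiner) decides whether the
/// replacement is profitable and applies it, so nothing is inserted into the
/// function here.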
void AArch64InstrInfo::genAlternativeCodeSequence(
    MachineInstr &Root, unsigned Pattern,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    DenseMap<Register, unsigned> &InstrIdxForVirtReg) const {
  MachineBasicBlock &MBB = *Root.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineFunction &MF = *MBB.getParent();
  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();

  MachineInstr *MUL = nullptr;
  const TargetRegisterClass *RC;
  unsigned Opc;
  switch (Pattern) {
  default:
    // Reassociate instructions.
    TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
                                                DelInstrs, InstrIdxForVirtReg);
    return;
  case AArch64MachineCombinerPattern::SUBADD_OP1:
    // A - (B + C)
    // ==> (A - B) - C
    genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
                     InstrIdxForVirtReg);
    return;
  case AArch64MachineCombinerPattern::SUBADD_OP2:
    // A - (B + C)
    // ==> (A - C) - B
    genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
                     InstrIdxForVirtReg);
    return;
  case AArch64MachineCombinerPattern::MULADDW_OP1:
  case AArch64MachineCombinerPattern::MULADDX_OP1:
    // MUL I=A,B,0
    // ADD R,I,C
    // ==> MADD R,A,B,C
    // --- Create(MADD);
    if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP1) {
      Opc = AArch64::MADDWrrr;
      RC = &AArch64::GPR32RegClass;
    } else {
      Opc = AArch64::MADDXrrr;
      RC = &AArch64::GPR64RegClass;
    }
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULADDW_OP2:
  case AArch64MachineCombinerPattern::MULADDX_OP2:
    // MUL I=A,B,0
    // ADD R,C,I
    // ==> MADD R,A,B,C
    // --- Create(MADD);
    if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP2) {
      Opc = AArch64::MADDWrrr;
      RC = &AArch64::GPR32RegClass;
    } else {
      Opc = AArch64::MADDXrrr;
      RC = &AArch64::GPR64RegClass;
    }
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULADDWI_OP1:
  case AArch64MachineCombinerPattern::MULADDXI_OP1:
  case AArch64MachineCombinerPattern::MULSUBWI_OP1:
  case AArch64MachineCombinerPattern::MULSUBXI_OP1: {
    // MUL I=A,B,0
    // ADD/SUB R,I,Imm
    // ==> MOV V, Imm/-Imm
    // ==> MADD R,A,B,V
    // --- Create(MADD);
    const TargetRegisterClass *RC;
    unsigned BitSize, MovImm;
    if (Pattern == AArch64MachineCombinerPattern::MULADDWI_OP1 ||
        Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1) {
      MovImm = AArch64::MOVi32imm;
      BitSize = 32;
      Opc = AArch64::MADDWrrr;
      RC = &AArch64::GPR32RegClass;
    } else {
      MovImm = AArch64::MOVi64imm;
      BitSize = 64;
      Opc = AArch64::MADDXrrr;
      RC = &AArch64::GPR64RegClass;
    }
    Register NewVR = MRI.createVirtualRegister(RC);
    uint64_t Imm = Root.getOperand(2).getImm();

    if (Root.getOperand(3).isImm()) {
      unsigned Val = Root.getOperand(3).getImm();
      Imm = Imm << Val;
    }
    bool IsSub = Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1 ||
                 Pattern == AArch64MachineCombinerPattern::MULSUBXI_OP1;
    uint64_t UImm = SignExtend64(IsSub ? -Imm : Imm, BitSize);
    // Check that the immediate can be composed via a single instruction.
    SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
    AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
    if (Insn.size() != 1)
      return;
    MachineInstrBuilder MIB1 =
        BuildMI(MF, MIMetadata(Root), TII->get(MovImm), NewVR)
            .addImm(IsSub ? -Imm : Imm);
    InsInstrs.push_back(MIB1);
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
    break;
  }
  case AArch64MachineCombinerPattern::MULSUBW_OP1:
  case AArch64MachineCombinerPattern::MULSUBX_OP1: {
    // MUL I=A,B,0
    // SUB R,I, C
    // ==> SUB  V, 0, C
    // ==> MADD R,A,B,V // = -C + A*B
    // --- Create(MADD);
    const TargetRegisterClass *SubRC;
    unsigned SubOpc, ZeroReg;
    if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP1) {
      SubOpc = AArch64::SUBWrr;
      SubRC = &AArch64::GPR32spRegClass;
      ZeroReg = AArch64::WZR;
      Opc = AArch64::MADDWrrr;
      RC = &AArch64::GPR32RegClass;
    } else {
      SubOpc = AArch64::SUBXrr;
      SubRC = &AArch64::GPR64spRegClass;
      ZeroReg = AArch64::XZR;
      Opc = AArch64::MADDXrrr;
      RC = &AArch64::GPR64RegClass;
    }
    Register NewVR = MRI.createVirtualRegister(SubRC);
    // SUB NewVR, 0, C
    MachineInstrBuilder MIB1 =
        BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR)
            .addReg(ZeroReg)
            .add(Root.getOperand(2));
    InsInstrs.push_back(MIB1);
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
    break;
  }
  case AArch64MachineCombinerPattern::MULSUBW_OP2:
  case AArch64MachineCombinerPattern::MULSUBX_OP2:
    // MUL I=A,B,0
    // SUB R,C,I
    // ==> MSUB R,A,B,C (computes C - A*B)
    // --- Create(MSUB);
    if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP2) {
      Opc = AArch64::MSUBWrrr;
      RC = &AArch64::GPR32RegClass;
    } else {
      Opc = AArch64::MSUBXrrr;
      RC = &AArch64::GPR64RegClass;
    }
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
    Opc = AArch64::MLAv8i8;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
    Opc = AArch64::MLAv8i8;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
    Opc = AArch64::MLAv16i8;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
    Opc = AArch64::MLAv16i8;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
    Opc = AArch64::MLAv4i16;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
    Opc = AArch64::MLAv4i16;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
    Opc = AArch64::MLAv8i16;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
    Opc = AArch64::MLAv8i16;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
    Opc = AArch64::MLAv2i32;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
    Opc = AArch64::MLAv2i32;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
    Opc = AArch64::MLAv4i32;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
    Opc = AArch64::MLAv4i32;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;

  case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
    Opc = AArch64::MLAv8i8;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
                                 RC);
    break;
  case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
    Opc = AArch64::MLSv8i8;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
    Opc = AArch64::MLAv16i8;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
                                 RC);
    break;
  case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
    Opc = AArch64::MLSv16i8;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
    Opc = AArch64::MLAv4i16;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
                                 RC);
    break;
  case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
    Opc = AArch64::MLSv4i16;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
    Opc = AArch64::MLAv8i16;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
                                 RC);
    break;
  case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
    Opc = AArch64::MLSv8i16;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
    Opc = AArch64::MLAv2i32;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
                                 RC);
    break;
  case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
    Opc = AArch64::MLSv2i32;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
    Opc = AArch64::MLAv4i32;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
                                 RC);
    break;
  case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
    Opc = AArch64::MLSv4i32;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;

  case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
    Opc = AArch64::MLAv4i16_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
    Opc = AArch64::MLAv4i16_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
    Opc = AArch64::MLAv8i16_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
    Opc = AArch64::MLAv8i16_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
    Opc = AArch64::MLAv2i32_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
    Opc = AArch64::MLAv2i32_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
    Opc = AArch64::MLAv4i32_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
    Opc = AArch64::MLAv4i32_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;

  case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
    Opc = AArch64::MLAv4i16_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
                                 RC);
    break;
  case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
    Opc = AArch64::MLSv4i16_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
    Opc = AArch64::MLAv8i16_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
                                 RC);
    break;
  case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
    Opc = AArch64::MLSv8i16_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
    Opc = AArch64::MLAv2i32_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
                                 RC);
    break;
  case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
    Opc = AArch64::MLSv2i32_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
    Opc = AArch64::MLAv4i32_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
                                 RC);
    break;
  case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
    Opc = AArch64::MLSv4i32_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;

  // Floating Point Support
  case AArch64MachineCombinerPattern::FMULADDH_OP1:
    Opc = AArch64::FMADDHrrr;
    RC = &AArch64::FPR16RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::FMULADDS_OP1:
    Opc = AArch64::FMADDSrrr;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::FMULADDD_OP1:
    Opc = AArch64::FMADDDrrr;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;

  case AArch64MachineCombinerPattern::FMULADDH_OP2:
    Opc = AArch64::FMADDHrrr;
    RC = &AArch64::FPR16RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::FMULADDS_OP2:
    Opc = AArch64::FMADDSrrr;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::FMULADDD_OP2:
    Opc = AArch64::FMADDDrrr;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;

  case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
    Opc = AArch64::FMLAv1i32_indexed;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                           FMAInstKind::Indexed);
    break;
  case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
    Opc = AArch64::FMLAv1i32_indexed;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;

  case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
    Opc = AArch64::FMLAv1i64_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                           FMAInstKind::Indexed);
    break;
  case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
    Opc = AArch64::FMLAv1i64_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;

  case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FMLAv4i16_indexed;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                           FMAInstKind::Indexed);
    break;
  case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FMLAv4f16;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                           FMAInstKind::Accumulator);
    break;
  case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FMLAv4i16_indexed;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;
  case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FMLAv4f16;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Accumulator);
    break;

  case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
    RC = &AArch64::FPR64RegClass;
    if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
      Opc = AArch64::FMLAv2i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLAv2f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;
  case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
    RC = &AArch64::FPR64RegClass;
    if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
      Opc = AArch64::FMLAv2i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLAv2f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;

  case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
    RC = &AArch64::FPR128RegClass;
    Opc = AArch64::FMLAv8i16_indexed;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                           FMAInstKind::Indexed);
    break;
  case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
    RC = &AArch64::FPR128RegClass;
    Opc = AArch64::FMLAv8f16;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                           FMAInstKind::Accumulator);
    break;
  case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
    RC = &AArch64::FPR128RegClass;
    Opc = AArch64::FMLAv8i16_indexed;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;
  case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
    RC = &AArch64::FPR128RegClass;
    Opc = AArch64::FMLAv8f16;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Accumulator);
    break;

  case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
    RC = &AArch64::FPR128RegClass;
    if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
      Opc = AArch64::FMLAv2i64_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLAv2f64;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;
  case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
    RC = &AArch64::FPR128RegClass;
    if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
      Opc = AArch64::FMLAv2i64_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLAv2f64;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;

  case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
  case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
    RC = &AArch64::FPR128RegClass;
    if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
      Opc = AArch64::FMLAv4i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLAv4f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;

  case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
  case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
    RC = &AArch64::FPR128RegClass;
    if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
      Opc = AArch64::FMLAv4i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLAv4f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;

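  // An fmul feeding the first operand of an fsub maps onto FNMSUB, whose
  // result is Rn*Rm - Ra (here A*B - C); the FNMULSUB cases below instead
  // need FNMADD, which computes -Ra - Rn*Rm.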
  case AArch64MachineCombinerPattern::FMULSUBH_OP1:
    Opc = AArch64::FNMSUBHrrr;
    RC = &AArch64::FPR16RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::FMULSUBS_OP1:
    Opc = AArch64::FNMSUBSrrr;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::FMULSUBD_OP1:
    Opc = AArch64::FNMSUBDrrr;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;

  case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
    Opc = AArch64::FNMADDHrrr;
    RC = &AArch64::FPR16RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
    Opc = AArch64::FNMADDSrrr;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
    Opc = AArch64::FNMADDDrrr;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;

  case AArch64MachineCombinerPattern::FMULSUBH_OP2:
    Opc = AArch64::FMSUBHrrr;
    RC = &AArch64::FPR16RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::FMULSUBS_OP2:
    Opc = AArch64::FMSUBSrrr;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case AArch64MachineCombinerPattern::FMULSUBD_OP2:
    Opc = AArch64::FMSUBDrrr;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;

  case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
    Opc = AArch64::FMLSv1i32_indexed;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;

  case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
    Opc = AArch64::FMLSv1i64_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;

  case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
  case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
    RC = &AArch64::FPR64RegClass;
    Register NewVR = MRI.createVirtualRegister(RC);
    MachineInstrBuilder MIB1 =
        BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR)
            .add(Root.getOperand(2));
    InsInstrs.push_back(MIB1);
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    if (Pattern == AArch64MachineCombinerPattern::FMLSv4f16_OP1) {
      Opc = AArch64::FMLAv4f16;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator, &NewVR);
    } else {
      Opc = AArch64::FMLAv4i16_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed, &NewVR);
    }
    break;
  }
  case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FMLSv4f16;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Accumulator);
    break;
  case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FMLSv4i16_indexed;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;

  case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
  case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
    RC = &AArch64::FPR64RegClass;
    if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
      Opc = AArch64::FMLSv2i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLSv2f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;

  case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
  case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
    RC = &AArch64::FPR128RegClass;
    Register NewVR = MRI.createVirtualRegister(RC);
    MachineInstrBuilder MIB1 =
        BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR)
            .add(Root.getOperand(2));
    InsInstrs.push_back(MIB1);
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    if (Pattern == AArch64MachineCombinerPattern::FMLSv8f16_OP1) {
      Opc = AArch64::FMLAv8f16;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator, &NewVR);
    } else {
      Opc = AArch64::FMLAv8i16_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed, &NewVR);
    }
    break;
  }
  case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
    RC = &AArch64::FPR128RegClass;
    Opc = AArch64::FMLSv8f16;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Accumulator);
    break;
  case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
    RC = &AArch64::FPR128RegClass;
    Opc = AArch64::FMLSv8i16_indexed;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;

  case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
  case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
    RC = &AArch64::FPR128RegClass;
    if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
      Opc = AArch64::FMLSv2i64_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLSv2f64;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;

  case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
  case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
    RC = &AArch64::FPR128RegClass;
    if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
      Opc = AArch64::FMLSv4i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLSv4f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;
  case AArch64MachineCombinerPattern::FMLSv2f32_OP1:
  case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
    RC = &AArch64::FPR64RegClass;
    Register NewVR = MRI.createVirtualRegister(RC);
    MachineInstrBuilder MIB1 =
        BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR)
            .add(Root.getOperand(2));
    InsInstrs.push_back(MIB1);
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
      Opc = AArch64::FMLAv2i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed, &NewVR);
    } else {
      Opc = AArch64::FMLAv2f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator, &NewVR);
    }
    break;
  }
  case AArch64MachineCombinerPattern::FMLSv4f32_OP1:
  case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
    RC = &AArch64::FPR128RegClass;
    Register NewVR = MRI.createVirtualRegister(RC);
    MachineInstrBuilder MIB1 =
        BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR)
            .add(Root.getOperand(2));
    InsInstrs.push_back(MIB1);
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
      Opc = AArch64::FMLAv4i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed, &NewVR);
    } else {
      Opc = AArch64::FMLAv4f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator, &NewVR);
    }
    break;
  }
  case AArch64MachineCombinerPattern::FMLSv2f64_OP1:
  case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
    RC = &AArch64::FPR128RegClass;
    Register NewVR = MRI.createVirtualRegister(RC);
    MachineInstrBuilder MIB1 =
        BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR)
            .add(Root.getOperand(2));
    InsInstrs.push_back(MIB1);
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
      Opc = AArch64::FMLAv2i64_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed, &NewVR);
    } else {
      Opc = AArch64::FMLAv2f64;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator, &NewVR);
    }
    break;
  }
  case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2: {
    unsigned IdxDupOp =
        (Pattern == AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1
                                                                          : 2;
    genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
                       &AArch64::FPR128RegClass, MRI);
    break;
  }
  case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2: {
    unsigned IdxDupOp =
        (Pattern == AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1
                                                                          : 2;
    genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
                       &AArch64::FPR128RegClass, MRI);
    break;
  }
  case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2: {
    unsigned IdxDupOp =
        (Pattern == AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1
                                                                          : 2;
    genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
                       &AArch64::FPR128_loRegClass, MRI);
    break;
  }
  case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2: {
    unsigned IdxDupOp =
        (Pattern == AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1
                                                                          : 2;
    genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
                       &AArch64::FPR128RegClass, MRI);
    break;
  }
  case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
  case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2: {
    unsigned IdxDupOp =
        (Pattern == AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1
                                                                          : 2;
    genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
                       &AArch64::FPR128_loRegClass, MRI);
    break;
  }
  case AArch64MachineCombinerPattern::FNMADD: {
    MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
    break;
  }

  } // end switch (Pattern)
  // Record MUL and ADD/SUB for deletion
  if (MUL)
    DelInstrs.push_back(MUL);
  DelInstrs.push_back(&Root);

  // Set the flags on the inserted instructions to be the merged flags of the
  // instructions that we have combined.
  uint32_t Flags = Root.getFlags();
  if (MUL)
    Flags = Root.mergeFlagsWith(*MUL);
  for (auto *MI : InsInstrs)
    MI->setFlags(Flags);
}

/// Replace a csinc-branch sequence by a simple conditional branch
///
/// Examples:
/// 1. \code
///   csinc w9, wzr, wzr, <condition code>
///   tbnz  w9, #0, 0x44
///   \endcode
/// to
/// \code
///   b.<inverted condition code>
///   \endcode
///
/// 2. \code
///   csinc w9, wzr, wzr, <condition code>
///   tbz   w9, #0, 0x44
///   \endcode
/// to
/// \code
///   b.<condition code>
///   \endcode
///
/// Replace a compare-and-branch sequence by a TBZ/TBNZ instruction when the
/// compare's constant operand is a power of 2.
///
/// Examples:
/// \code
///   and  w8, w8, #0x400
///   cbnz w8, L1
///   \endcode
/// to
/// \code
///   tbnz w8, #10, L1
///   \endcode
///
/// \param MI Conditional Branch
/// \return True when the simple conditional branch is generated
///
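/// Both rewrites are applied only when it is safe to do so: the AND case
/// additionally requires the definition to be in the same block with a single
/// use, and the CSINC case requires NZCV to be unmodified between the CSINC
/// and the branch, as checked below.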
bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
  bool IsNegativeBranch = false;
  bool IsTestAndBranch = false;
  unsigned TargetBBInMI = 0;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
  case AArch64::CBWPri:
  case AArch64::CBXPri:
  case AArch64::CBWPrr:
  case AArch64::CBXPrr:
    return false;
  case AArch64::CBZW:
  case AArch64::CBZX:
    TargetBBInMI = 1;
    break;
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    TargetBBInMI = 1;
    IsNegativeBranch = true;
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
    TargetBBInMI = 2;
    IsTestAndBranch = true;
    break;
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    TargetBBInMI = 2;
    IsNegativeBranch = true;
    IsTestAndBranch = true;
    break;
  }
  // So we increment a zero register and test for bits other
  // than bit 0? Conservatively bail out in case the verifier
  // missed this case.
  if (IsTestAndBranch && MI.getOperand(1).getImm())
    return false;

  // Find Definition.
  assert(MI.getParent() && "Incomplete machine instruction\n");
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  Register VReg = MI.getOperand(0).getReg();
  if (!VReg.isVirtual())
    return false;

  MachineInstr *DefMI = MRI->getVRegDef(VReg);

  // Look through COPY instructions to find definition.
  while (DefMI->isCopy()) {
    Register CopyVReg = DefMI->getOperand(1).getReg();
    if (!MRI->hasOneNonDBGUse(CopyVReg))
      return false;
    if (!MRI->hasOneDef(CopyVReg))
      return false;
    DefMI = MRI->getVRegDef(CopyVReg);
  }

  switch (DefMI->getOpcode()) {
  default:
    return false;
  // Fold AND into a TBZ/TBNZ if constant operand is a power of 2.
  case AArch64::ANDWri:
  case AArch64::ANDXri: {
    if (IsTestAndBranch)
      return false;
    if (DefMI->getParent() != MBB)
      return false;
    if (!MRI->hasOneNonDBGUse(VReg))
      return false;

    bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
    uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
        DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
    if (!isPowerOf2_64(Mask))
      return false;

    MachineOperand &MO = DefMI->getOperand(1);
    Register NewReg = MO.getReg();
    if (!NewReg.isVirtual())
      return false;

    assert(!MRI->def_empty(NewReg) && "Register must be defined.");

    MachineBasicBlock &RefToMBB = *MBB;
    MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
    DebugLoc DL = MI.getDebugLoc();
    unsigned Imm = Log2_64(Mask);
    unsigned Opc = (Imm < 32)
                       ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
                       : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
    MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
                              .addReg(NewReg)
                              .addImm(Imm)
                              .addMBB(TBB);
    // The register now lives on past the TBZ/TBNZ.
    MO.setIsKill(false);

    // For a bit position smaller than 32, we have to use the 32-bit
    // variant (W) in all cases; the 64-bit variant cannot encode it.
    // Therefore, if the input register is 64-bit, we need to take the
    // 32-bit sub-part.
    if (!Is32Bit && Imm < 32)
      NewMI->getOperand(0).setSubReg(AArch64::sub_32);
    MI.eraseFromParent();
    return true;
  }
  // Look for CSINC
  case AArch64::CSINCWr:
  case AArch64::CSINCXr: {
    if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
          DefMI->getOperand(2).getReg() == AArch64::WZR) &&
        !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
          DefMI->getOperand(2).getReg() == AArch64::XZR))
      return false;

    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
                                         true) != -1)
      return false;

    AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
    // Convert only when the condition code is not modified between
    // the CSINC and the branch. The CC may be used by other
    // instructions in between.
    if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
      return false;
    MachineBasicBlock &RefToMBB = *MBB;
    MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
    DebugLoc DL = MI.getDebugLoc();
    if (IsNegativeBranch)
      CC = AArch64CC::getInvertedCondCode(CC);
    BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
    MI.eraseFromParent();
    return true;
  }
  }
}

std::pair<unsigned, unsigned>
AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
  const unsigned Mask = AArch64II::MO_FRAGMENT;
  return std::make_pair(TF & Mask, TF & ~Mask);
}

ArrayRef<std::pair<unsigned, const char *>>
AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
  using namespace AArch64II;

  static const std::pair<unsigned, const char *> TargetFlags[] = {
      {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
      {MO_G3, "aarch64-g3"},     {MO_G2, "aarch64-g2"},
      {MO_G1, "aarch64-g1"},     {MO_G0, "aarch64-g0"},
      {MO_HI12, "aarch64-hi12"}};
  return ArrayRef(TargetFlags);
}

ArrayRef<std::pair<unsigned, const char *>>
AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
  using namespace AArch64II;

  static const std::pair<unsigned, const char *> TargetFlags[] = {
      {MO_COFFSTUB, "aarch64-coffstub"},
      {MO_GOT, "aarch64-got"},
      {MO_NC, "aarch64-nc"},
      {MO_S, "aarch64-s"},
      {MO_TLS, "aarch64-tls"},
      {MO_DLLIMPORT, "aarch64-dllimport"},
      {MO_PREL, "aarch64-prel"},
      {MO_TAGGED, "aarch64-tagged"},
      {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
  };
  return ArrayRef(TargetFlags);
}

ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
  static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
      {{MOSuppressPair, "aarch64-suppress-pair"},
       {MOStridedAccess, "aarch64-strided-access"}};
  return ArrayRef(TargetFlags);
}

/// Constants defining how certain sequences should be outlined.
/// This encompasses how an outlined function should be called, and what kind of
/// frame should be emitted for that outlined function.
///
/// \p MachineOutlinerDefault implies that the function should be called with
/// a save and restore of LR to the stack.
///
/// That is,
///
/// I1     Save LR                    OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3     Restore LR                 I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 3 (save + BL + restore)
/// * Frame construction overhead: 1 (ret)
/// * Requires stack fixups? Yes
///
/// \p MachineOutlinerTailCall implies that the function is being created from
/// a sequence of instructions ending in a return.
///
/// That is,
///
/// I1                             OUTLINED_FUNCTION:
/// I2 --> B OUTLINED_FUNCTION     I1
/// RET                            I2
///                                RET
///
/// * Call construction overhead: 1 (B)
/// * Frame construction overhead: 0 (Return included in sequence)
/// * Requires stack fixups? No
///
/// \p MachineOutlinerNoLRSave implies that the function should be called using
/// a BL instruction, but doesn't require LR to be saved and restored. This
/// happens when LR is known to be dead.
///
/// That is,
///
/// I1                                OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3                                I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 1 (BL)
/// * Frame construction overhead: 1 (RET)
/// * Requires stack fixups? No
///
/// \p MachineOutlinerThunk implies that the function is being created from
/// a sequence of instructions ending in a call. The outlined function is
/// called with a BL instruction, and the outlined function tail-calls the
/// original call destination.
///
/// That is,
///
/// I1                                OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// BL f                              I2
///                                   B f
/// * Call construction overhead: 1 (BL)
/// * Frame construction overhead: 0
/// * Requires stack fixups? No
///
/// \p MachineOutlinerRegSave implies that the function should be called with a
/// save and restore of LR to an available register. This allows us to avoid
/// stack fixups. Note that this outlining variant is compatible with the
/// NoLRSave case.
///
/// That is,
///
/// I1     Save LR                    OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3     Restore LR                 I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 3 (save + BL + restore)
/// * Frame construction overhead: 1 (ret)
/// * Requires stack fixups? No
enum MachineOutlinerClass {
  MachineOutlinerDefault,  /// Emit a save, restore, call, and return.
  MachineOutlinerTailCall, /// Only emit a branch.
  MachineOutlinerNoLRSave, /// Emit a call and return.
  MachineOutlinerThunk,    /// Emit a call and tail-call.
  MachineOutlinerRegSave   /// Same as default, but save to a register.
};

enum MachineOutlinerMBBFlags {
  LRUnavailableSomewhere = 0x2,
  HasCalls = 0x4,
  UnsafeRegsDead = 0x8
};

Register
AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
  MachineFunction *MF = C.getMF();
  const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
  const AArch64RegisterInfo *ARI =
      static_cast<const AArch64RegisterInfo *>(&TRI);
  // Check if there is an available register across the sequence that we can
  // use.
  for (unsigned Reg : AArch64::GPR64RegClass) {
    if (!ARI->isReservedReg(*MF, Reg) &&
        Reg != AArch64::LR &&  // LR is not reserved, but don't use it.
        Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
        Reg != AArch64::X17 && // Ditto for X17.
        C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
        C.isAvailableInsideSeq(Reg, TRI))
      return Reg;
  }
  return Register();
}

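// Outlining candidates may come from functions built with different
// return-address signing settings; they can only be merged into one outlined
// function when those settings agree, which the helpers below check pairwise.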
static bool
outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
                                         const outliner::Candidate &b) {
  const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
  const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();

  return MFIa->shouldSignReturnAddress(false) ==
             MFIb->shouldSignReturnAddress(false) &&
         MFIa->shouldSignReturnAddress(true) ==
             MFIb->shouldSignReturnAddress(true);
}

static bool
outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
                                       const outliner::Candidate &b) {
  const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
  const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();

  return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
}

static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
                                                const outliner::Candidate &b) {
  const AArch64Subtarget &SubtargetA =
      a.getMF()->getSubtarget<AArch64Subtarget>();
  const AArch64Subtarget &SubtargetB =
      b.getMF()->getSubtarget<AArch64Subtarget>();
  return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
}

std::optional<std::unique_ptr<outliner::OutlinedFunction>>
AArch64InstrInfo::getOutliningCandidateInfo(
    const MachineModuleInfo &MMI,
    std::vector<outliner::Candidate> &RepeatedSequenceLocs,
    unsigned MinRepeats) const {
  unsigned SequenceSize = 0;
  for (auto &MI : RepeatedSequenceLocs[0])
    SequenceSize += getInstSizeInBytes(MI);

  unsigned NumBytesToCreateFrame = 0;

  // We only allow outlining for functions having exactly matching return
  // address signing attributes, i.e., all share the same value for the
  // attribute "sign-return-address" and all share the same type of key they
  // are signed with.
  // Additionally we require all functions to simultaneously either support
  // v8.3a features or not. Otherwise an outlined function could get signed
  // using dedicated v8.3 instructions and a call from a function that doesn't
  // support v8.3 instructions would therefore be invalid.
  if (std::adjacent_find(
          RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
          [](const outliner::Candidate &a, const outliner::Candidate &b) {
            // Return true if a and b are non-equal w.r.t. return address
            // signing or support of v8.3a features
            if (outliningCandidatesSigningScopeConsensus(a, b) &&
                outliningCandidatesSigningKeyConsensus(a, b) &&
                outliningCandidatesV8_3OpsConsensus(a, b)) {
              return false;
            }
            return true;
          }) != RepeatedSequenceLocs.end()) {
    return std::nullopt;
  }

  // Since at this point all candidates agree on their return address signing,
  // picking just one is fine. If the candidate functions potentially sign their
  // return addresses, the outlined function should do the same. Note that in
  // the case of "sign-return-address"="non-leaf" this is an assumption: It is
  // not certainly true that the outlined function will have to sign its return
  // address but this decision is made later, when the decision to outline
  // has already been made.
  // The same holds for the number of additional instructions we need: On
  // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
  // necessary. However, at this point we don't know if the outlined function
  // will have a RET instruction so we assume the worst.
  const TargetRegisterInfo &TRI = getRegisterInfo();
  // Performing a tail call may require extra checks when PAuth is enabled.
  // If PAuth is disabled, set it to zero for uniformity.
  unsigned NumBytesToCheckLRInTCEpilogue = 0;
  if (RepeatedSequenceLocs[0]
          .getMF()
          ->getInfo<AArch64FunctionInfo>()
          ->shouldSignReturnAddress(true)) {
    // One PAC and one AUT instruction
    NumBytesToCreateFrame += 8;

    // PAuth is enabled - set extra tail call cost, if any.
    auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
        *RepeatedSequenceLocs[0].getMF());
    NumBytesToCheckLRInTCEpilogue =
        AArch64PAuth::getCheckerSizeInBytes(LRCheckMethod);
    // Checking the authenticated LR value may significantly impact
    // SequenceSize, so account for it for more precise results.
    if (isTailCallReturnInst(RepeatedSequenceLocs[0].back()))
      SequenceSize += NumBytesToCheckLRInTCEpilogue;

    // We have to check if SP-modifying instructions would get outlined.
    // If so, we only allow outlining if SP is unchanged overall, so matching
    // sub and add instructions are okay to outline; all other SP
    // modifications are not.
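    // For example, a matched "sub sp, sp, #16 ... add sp, sp, #16" pair nets
    // to zero and is acceptable, while an unmatched adjustment is rejected.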
9134 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
9135 int SPValue = 0;
9136 for (auto &MI : C) {
9137 if (MI.modifiesRegister(AArch64::SP, &TRI)) {
9138 switch (MI.getOpcode()) {
9139 case AArch64::ADDXri:
9140 case AArch64::ADDWri:
9141 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
9142 assert(MI.getOperand(2).isImm() &&
9143 "Expected operand to be immediate");
9144 assert(MI.getOperand(1).isReg() &&
9145 "Expected operand to be a register");
9146 // Check if the add just increments sp. If so, we search for
9147 // matching sub instructions that decrement sp. If not, the
9148 // modification is illegal
9149 if (MI.getOperand(1).getReg() == AArch64::SP)
9150 SPValue += MI.getOperand(2).getImm();
9151 else
9152 return true;
9153 break;
9154 case AArch64::SUBXri:
9155 case AArch64::SUBWri:
9156 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
9157 assert(MI.getOperand(2).isImm() &&
9158 "Expected operand to be immediate");
9159 assert(MI.getOperand(1).isReg() &&
9160 "Expected operand to be a register");
9161 // Check if the sub just decrements sp. If so, we search for
9162 // matching add instructions that increment sp. If not, the
9163 // modification is illegal
9164 if (MI.getOperand(1).getReg() == AArch64::SP)
9165 SPValue -= MI.getOperand(2).getImm();
9166 else
9167 return true;
9168 break;
9169 default:
9170 return true;
9171 }
9172 }
9173 }
9174 if (SPValue)
9175 return true;
9176 return false;
9177 };
9178 // Remove candidates with illegal stack modifying instructions
9179 llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
9180
9181 // If the sequence doesn't have enough candidates left, then we're done.
9182 if (RepeatedSequenceLocs.size() < MinRepeats)
9183 return std::nullopt;
9184 }
9185
9186 // Properties about candidate MBBs that hold for all of them.
9187 unsigned FlagsSetInAll = 0xF;
9188
9189 // Compute liveness information for each candidate, and set FlagsSetInAll.
9190 for (outliner::Candidate &C : RepeatedSequenceLocs)
9191 FlagsSetInAll &= C.Flags;
9192
9193 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
9194
9195 // Helper lambda which sets call information for every candidate.
9196 auto SetCandidateCallInfo =
9197 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
9198 for (outliner::Candidate &C : RepeatedSequenceLocs)
9199 C.setCallInfo(CallID, NumBytesForCall);
9200 };
9201
9202 unsigned FrameID = MachineOutlinerDefault;
9203 NumBytesToCreateFrame += 4;
9204
9205 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
9206 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
9207 });
9208
9209   // Check whether CFI instructions are present; if they are, count how many
9210   // appear in the candidate sequence.
9211 unsigned CFICount = 0;
9212 for (auto &I : RepeatedSequenceLocs[0]) {
9213 if (I.isCFIInstruction())
9214 CFICount++;
9215 }
9216
9217 // We compare the number of found CFI Instructions to the number of CFI
9218 // instructions in the parent function for each candidate. We must check this
9219 // since if we outline one of the CFI instructions in a function, we have to
9220 // outline them all for correctness. If we do not, the address offsets will be
9221 // incorrect between the two sections of the program.
9222 for (outliner::Candidate &C : RepeatedSequenceLocs) {
9223 std::vector<MCCFIInstruction> CFIInstructions =
9224 C.getMF()->getFrameInstructions();
9225
9226 if (CFICount > 0 && CFICount != CFIInstructions.size())
9227 return std::nullopt;
9228 }
9229
9230   // Returns true if an instruction is safe to fix up, false otherwise.
9231 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
9232 if (MI.isCall())
9233 return true;
9234
9235 if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
9236 !MI.readsRegister(AArch64::SP, &TRI))
9237 return true;
9238
9239 // Any modification of SP will break our code to save/restore LR.
9240 // FIXME: We could handle some instructions which add a constant
9241 // offset to SP, with a bit more work.
9242 if (MI.modifiesRegister(AArch64::SP, &TRI))
9243 return false;
9244
9245 // At this point, we have a stack instruction that we might need to
9246 // fix up. We'll handle it if it's a load or store.
9247 if (MI.mayLoadOrStore()) {
9248 const MachineOperand *Base; // Filled with the base operand of MI.
9249 int64_t Offset; // Filled with the offset of MI.
9250 bool OffsetIsScalable;
9251
9252 // Does it allow us to offset the base operand and is the base the
9253 // register SP?
9254 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
9255 !Base->isReg() || Base->getReg() != AArch64::SP)
9256 return false;
9257
9258       // The fix-up code below assumes byte offsets.
9259 if (OffsetIsScalable)
9260 return false;
9261
9262 // Find the minimum/maximum offset for this instruction and check
9263 // if fixing it up would be in range.
9264 int64_t MinOffset,
9265 MaxOffset; // Unscaled offsets for the instruction.
9266 // The scale to multiply the offsets by.
9267 TypeSize Scale(0U, false), DummyWidth(0U, false);
9268 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
9269
9270 Offset += 16; // Update the offset to what it would be if we outlined.
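      // For example (illustrative): "ldr x0, [sp, #8]" in the candidate would
      // become "ldr x0, [sp, #24]" in the outlined function, because the LR
      // spill ("str x30, [sp, #-16]!") moves SP down by 16 bytes.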
9271 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
9272 Offset > MaxOffset * (int64_t)Scale.getFixedValue())
9273 return false;
9274
9275 // It's in range, so we can outline it.
9276 return true;
9277 }
9278
9279 // FIXME: Add handling for instructions like "add x0, sp, #8".
9280
9281 // We can't fix it up, so don't outline it.
9282 return false;
9283 };
9284
9285 // True if it's possible to fix up each stack instruction in this sequence.
9286 // Important for frames/call variants that modify the stack.
9287 bool AllStackInstrsSafe =
9288 llvm::all_of(RepeatedSequenceLocs[0], IsSafeToFixup);
9289
9290 // If the last instruction in any candidate is a terminator, then we should
9291 // tail call all of the candidates.
9292 if (RepeatedSequenceLocs[0].back().isTerminator()) {
9293 FrameID = MachineOutlinerTailCall;
9294 NumBytesToCreateFrame = 0;
9295 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
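    // (4 bytes for the tail-call branch itself, plus the LR-check code, if
    // any, emitted in the epilogue.)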
9296 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
9297 }
9298
9299 else if (LastInstrOpcode == AArch64::BL ||
9300 ((LastInstrOpcode == AArch64::BLR ||
9301 LastInstrOpcode == AArch64::BLRNoIP) &&
9302 !HasBTI)) {
9303 // FIXME: Do we need to check if the code after this uses the value of LR?
9304 FrameID = MachineOutlinerThunk;
9305 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
9306 SetCandidateCallInfo(MachineOutlinerThunk, 4);
9307 }
9308
9309 else {
9310 // We need to decide how to emit calls + frames. We can always emit the same
9311 // frame if we don't need to save to the stack. If we have to save to the
9312 // stack, then we need a different frame.
9313 unsigned NumBytesNoStackCalls = 0;
9314 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
9315
9316 // Check if we have to save LR.
9317 for (outliner::Candidate &C : RepeatedSequenceLocs) {
9318 bool LRAvailable =
9319 (C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere)
9320 ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)
9321 : true;
9322 // If we have a noreturn caller, then we're going to be conservative and
9323 // say that we have to save LR. If we don't have a ret at the end of the
9324 // block, then we can't reason about liveness accurately.
9325 //
9326 // FIXME: We can probably do better than always disabling this in
9327 // noreturn functions by fixing up the liveness info.
9328 bool IsNoReturn =
9329 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
9330
9331 // Is LR available? If so, we don't need a save.
9332 if (LRAvailable && !IsNoReturn) {
9333 NumBytesNoStackCalls += 4;
9334 C.setCallInfo(MachineOutlinerNoLRSave, 4);
9335 CandidatesWithoutStackFixups.push_back(C);
9336 }
9337
9338 // Is an unused register available? If so, we won't modify the stack, so
9339 // we can outline with the same frame type as those that don't save LR.
9340 else if (findRegisterToSaveLRTo(C)) {
9341 NumBytesNoStackCalls += 12;
9342 C.setCallInfo(MachineOutlinerRegSave, 12);
9343 CandidatesWithoutStackFixups.push_back(C);
9344 }
9345
9346 // Is SP used in the sequence at all? If not, we don't have to modify
9347 // the stack, so we are guaranteed to get the same frame.
9348 else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
9349 NumBytesNoStackCalls += 12;
9350 C.setCallInfo(MachineOutlinerDefault, 12);
9351 CandidatesWithoutStackFixups.push_back(C);
9352 }
9353
9354 // If we outline this, we need to modify the stack. Pretend we don't
9355 // outline this by saving all of its bytes.
9356 else {
9357 NumBytesNoStackCalls += SequenceSize;
9358 }
9359 }
9360
9361 // If there are no places where we have to save LR, then note that we
9362 // don't have to update the stack. Otherwise, give every candidate the
9363 // default call type, as long as it's safe to do so.
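    // For example (illustrative): with 3 candidates, the default save-to-stack
    // variant costs 3 * 12 bytes of call code, so the no-stack variants win
    // whenever NumBytesNoStackCalls <= 36.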
9364 if (!AllStackInstrsSafe ||
9365 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
9366 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
9367 FrameID = MachineOutlinerNoLRSave;
9368 if (RepeatedSequenceLocs.size() < MinRepeats)
9369 return std::nullopt;
9370 } else {
9371 SetCandidateCallInfo(MachineOutlinerDefault, 12);
9372
9373 // Bugzilla ID: 46767
9374 // TODO: Check if fixing up the stack more than once is safe so we can
9375 // outline these.
9376 //
9377 // An outline resulting in a caller that requires stack fixups at the
9378 // callsite to a callee that also requires stack fixups can happen when
9379 // there are no available registers at the candidate callsite for a
9380 // candidate that itself also has calls.
9381 //
9382       // In other words, if function_containing_sequence in the following
9383       // pseudo-assembly requires that we save LR at the point of the call,
9384       // but there are no available registers, we save using SP; as a result,
9385       // the SP offsets require stack fixups by multiples of 16.
9386 //
9387 // function_containing_sequence:
9388 // ...
9389 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
9390 // call OUTLINED_FUNCTION_N
9391 // restore LR from SP
9392 // ...
9393 //
9394 // OUTLINED_FUNCTION_N:
9395 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
9396 // ...
9397 // bl foo
9398 // restore LR from SP
9399 // ret
9400 //
9401       // Because the code to handle more than one stack fixup does not
9402       // currently have the proper checks for legality, these cases will assert
9403       // in the AArch64 MachineOutliner. This is because the code to do this
9404       // needs more hardening, testing, better checks that generated code is
9405       // legal, etc., and because it is only verified to handle a single pass
9406       // of stack fixup.
9407 //
9408 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
9409 // these cases until they are known to be handled. Bugzilla 46767 is
9410 // referenced in comments at the assert site.
9411 //
9412       // To avoid asserting (or generating illegal code in no-assert builds),
9413       // we remove all candidates which would need more than one stack fixup by
9414       // pruning the cases where the candidate has calls while also having no
9415       // available LR and no available general-purpose registers to copy LR
9416       // to (i.e., one extra stack save/restore).
9417 //
9418 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
9419 erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
9420 auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
9421 return (llvm::any_of(C, IsCall)) &&
9422 (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
9423 !findRegisterToSaveLRTo(C));
9424 });
9425 }
9426 }
9427
9428 // If we dropped all of the candidates, bail out here.
9429 if (RepeatedSequenceLocs.size() < MinRepeats)
9430 return std::nullopt;
9431 }
9432
9433 // Does every candidate's MBB contain a call? If so, then we might have a call
9434 // in the range.
9435 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
9436 // Check if the range contains a call. These require a save + restore of the
9437 // link register.
9438 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
9439 bool ModStackToSaveLR = false;
9440 if (any_of(drop_end(FirstCand),
9441 [](const MachineInstr &MI) { return MI.isCall(); }))
9442 ModStackToSaveLR = true;
9443
9444 // Handle the last instruction separately. If this is a tail call, then the
9445 // last instruction is a call. We don't want to save + restore in this case.
9446 // However, it could be possible that the last instruction is a call without
9447 // it being valid to tail call this sequence. We should consider this as
9448 // well.
9449 else if (FrameID != MachineOutlinerThunk &&
9450 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
9451 ModStackToSaveLR = true;
9452
9453 if (ModStackToSaveLR) {
9454 // We can't fix up the stack. Bail out.
9455 if (!AllStackInstrsSafe)
9456 return std::nullopt;
9457
9458 // Save + restore LR.
9459 NumBytesToCreateFrame += 8;
9460 }
9461 }
9462
9463   // If we have CFI instructions, we can only outline if the outlined section
9464   // can be a tail call.
9465 if (FrameID != MachineOutlinerTailCall && CFICount > 0)
9466 return std::nullopt;
9467
9468 return std::make_unique<outliner::OutlinedFunction>(
9469 RepeatedSequenceLocs, SequenceSize, NumBytesToCreateFrame, FrameID);
9470 }
9471
9472 void AArch64InstrInfo::mergeOutliningCandidateAttributes(
9473 Function &F, std::vector<outliner::Candidate> &Candidates) const {
9474   // If a bunch of candidates reach this point, they must agree on their return
9475   // address signing. It is therefore enough to just consider the signing
9476   // behaviour of one of them.
9477 const auto &CFn = Candidates.front().getMF()->getFunction();
9478
9479 if (CFn.hasFnAttribute("ptrauth-returns"))
9480 F.addFnAttr(CFn.getFnAttribute("ptrauth-returns"));
9481 if (CFn.hasFnAttribute("ptrauth-auth-traps"))
9482 F.addFnAttr(CFn.getFnAttribute("ptrauth-auth-traps"));
9483 // Since all candidates belong to the same module, just copy the
9484 // function-level attributes of an arbitrary function.
9485 if (CFn.hasFnAttribute("sign-return-address"))
9486 F.addFnAttr(CFn.getFnAttribute("sign-return-address"));
9487 if (CFn.hasFnAttribute("sign-return-address-key"))
9488 F.addFnAttr(CFn.getFnAttribute("sign-return-address-key"));
9489
9490 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
9491 }
9492
9493 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
9494 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
9495 const Function &F = MF.getFunction();
9496
9497 // Can F be deduplicated by the linker? If it can, don't outline from it.
9498 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
9499 return false;
9500
9501 // Don't outline from functions with section markings; the program could
9502 // expect that all the code is in the named section.
9503 // FIXME: Allow outlining from multiple functions with the same section
9504 // marking.
9505 if (F.hasSection())
9506 return false;
9507
9508 // Outlining from functions with redzones is unsafe since the outliner may
9509 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
9510 // outline from it.
9511 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
9512 if (!AFI || AFI->hasRedZone().value_or(true))
9513 return false;
9514
9515 // FIXME: Determine whether it is safe to outline from functions which contain
9516 // streaming-mode changes. We may need to ensure any smstart/smstop pairs are
9517 // outlined together and ensure it is safe to outline with async unwind info,
9518 // required for saving & restoring VG around calls.
9519 if (AFI->hasStreamingModeChanges())
9520 return false;
9521
9522 // FIXME: Teach the outliner to generate/handle Windows unwind info.
9523 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
9524 return false;
9525
9526 // It's safe to outline from MF.
9527 return true;
9528 }
9529
9530 SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
9531 AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
9532 unsigned &Flags) const {
9533 assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
9534 "Must track liveness!");
9535 SmallVector<
9536 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
9537 Ranges;
9538 // According to the AArch64 Procedure Call Standard, the following are
9539 // undefined on entry/exit from a function call:
9540 //
9541 // * Registers x16, x17, (and thus w16, w17)
9542 // * Condition codes (and thus the NZCV register)
9543 //
9544 // If any of these registers are used inside or live across an outlined
9545 // function, then they may be modified later, either by the compiler or
9546 // some other tool (like the linker).
9547 //
9548 // To avoid outlining in these situations, partition each block into ranges
9549 // where these registers are dead. We will only outline from those ranges.
9550 LiveRegUnits LRU(getRegisterInfo());
9551 auto AreAllUnsafeRegsDead = [&LRU]() {
9552 return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&
9553 LRU.available(AArch64::NZCV);
9554 };
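  // Note: LiveRegUnits tracks register units, and a def of X16/X17 also
  // clobbers the units of W16/W17, so the checks above cover the X registers
  // as well.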
9555
9556 // We need to know if LR is live across an outlining boundary later on in
9557 // order to decide how we'll create the outlined call, frame, etc.
9558 //
9559 // It's pretty expensive to check this for *every candidate* within a block.
9560 // That's some potentially n^2 behaviour, since in the worst case, we'd need
9561 // to compute liveness from the end of the block for O(n) candidates within
9562 // the block.
9563 //
9564 // So, to improve the average case, let's keep track of liveness from the end
9565 // of the block to the beginning of *every outlinable range*. If we know that
9566 // LR is available in every range we could outline from, then we know that
9567 // we don't need to check liveness for any candidate within that range.
9568 bool LRAvailableEverywhere = true;
9569 // Compute liveness bottom-up.
9570 LRU.addLiveOuts(MBB);
9571 // Update flags that require info about the entire MBB.
9572 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
9573 if (MI.isCall() && !MI.isTerminator())
9574 Flags |= MachineOutlinerMBBFlags::HasCalls;
9575 };
9576 // Range: [RangeBegin, RangeEnd)
9577 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
9578 unsigned RangeLen;
9579 auto CreateNewRangeStartingAt =
9580 [&RangeBegin, &RangeEnd,
9581 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
9582 RangeBegin = NewBegin;
9583 RangeEnd = std::next(RangeBegin);
9584 RangeLen = 0;
9585 };
9586 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
9587 // At least one unsafe register is not dead. We do not want to outline at
9588 // this point. If it is long enough to outline from, save the range
9589 // [RangeBegin, RangeEnd).
9590 if (RangeLen > 1)
9591 Ranges.push_back(std::make_pair(RangeBegin, RangeEnd));
9592 };
9593 // Find the first point where all unsafe registers are dead.
9594 // FIND: <safe instr> <-- end of first potential range
9595 // SKIP: <unsafe def>
9596 // SKIP: ... everything between ...
9597 // SKIP: <unsafe use>
9598 auto FirstPossibleEndPt = MBB.instr_rbegin();
9599 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
9600 LRU.stepBackward(*FirstPossibleEndPt);
9601 // Update flags that impact how we outline across the entire block,
9602 // regardless of safety.
9603 UpdateWholeMBBFlags(*FirstPossibleEndPt);
9604 if (AreAllUnsafeRegsDead())
9605 break;
9606 }
9607 // If we exhausted the entire block, we have no safe ranges to outline.
9608 if (FirstPossibleEndPt == MBB.instr_rend())
9609 return Ranges;
9610 // Current range.
9611 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
9612   // FirstPossibleEndPt points to the first place where all unsafe registers
9613   // are dead (if there is any such point). Begin partitioning the MBB into
9614   // ranges.
9615 for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) {
9616 LRU.stepBackward(MI);
9617 UpdateWholeMBBFlags(MI);
9618 if (!AreAllUnsafeRegsDead()) {
9619 SaveRangeIfNonEmpty();
9620 CreateNewRangeStartingAt(MI.getIterator());
9621 continue;
9622 }
9623 LRAvailableEverywhere &= LRU.available(AArch64::LR);
9624 RangeBegin = MI.getIterator();
9625 ++RangeLen;
9626 }
9627   // The above loop misses the last (or only) range. If we are still safe,
9628   // then save the range.
9629 if (AreAllUnsafeRegsDead())
9630 SaveRangeIfNonEmpty();
9631 if (Ranges.empty())
9632 return Ranges;
9633   // We found the ranges bottom-up, but mapping expects them top-down.
9634   // Reverse the order.
9635 std::reverse(Ranges.begin(), Ranges.end());
9636 // If there is at least one outlinable range where LR is unavailable
9637 // somewhere, remember that.
9638 if (!LRAvailableEverywhere)
9639 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
9640 return Ranges;
9641 }
9642
9643 outliner::InstrType
9644 AArch64InstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
9645 MachineBasicBlock::iterator &MIT,
9646 unsigned Flags) const {
9647 MachineInstr &MI = *MIT;
9648
9649   // Don't outline anything used for return address signing. The outlined
9650   // function will get signed later if needed.
9651 switch (MI.getOpcode()) {
9652 case AArch64::PACM:
9653 case AArch64::PACIASP:
9654 case AArch64::PACIBSP:
9655 case AArch64::PACIASPPC:
9656 case AArch64::PACIBSPPC:
9657 case AArch64::AUTIASP:
9658 case AArch64::AUTIBSP:
9659 case AArch64::AUTIASPPCi:
9660 case AArch64::AUTIASPPCr:
9661 case AArch64::AUTIBSPPCi:
9662 case AArch64::AUTIBSPPCr:
9663 case AArch64::RETAA:
9664 case AArch64::RETAB:
9665 case AArch64::RETAASPPCi:
9666 case AArch64::RETAASPPCr:
9667 case AArch64::RETABSPPCi:
9668 case AArch64::RETABSPPCr:
9669 case AArch64::EMITBKEY:
9670 case AArch64::PAUTH_PROLOGUE:
9671 case AArch64::PAUTH_EPILOGUE:
9672 return outliner::InstrType::Illegal;
9673 }
9674
9675 // We can only outline these if we will tail call the outlined function, or
9676   // fix up the CFI offsets. Currently, CFI instructions are outlined only
9677   // when the outlined section is a tail call.
9678 //
9679 // FIXME: If the proper fixups for the offset are implemented, this should be
9680 // possible.
9681 if (MI.isCFIInstruction())
9682 return outliner::InstrType::Legal;
9683
9684 // Is this a terminator for a basic block?
9685 if (MI.isTerminator())
9686 // TargetInstrInfo::getOutliningType has already filtered out anything
9687 // that would break this, so we can allow it here.
9688 return outliner::InstrType::Legal;
9689
9690 // Make sure none of the operands are un-outlinable.
9691 for (const MachineOperand &MOP : MI.operands()) {
9692 // A check preventing CFI indices was here before, but only CFI
9693 // instructions should have those.
9694 assert(!MOP.isCFIIndex());
9695
9696 // If it uses LR or W30 explicitly, then don't touch it.
9697 if (MOP.isReg() && !MOP.isImplicit() &&
9698 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
9699 return outliner::InstrType::Illegal;
9700 }
9701
9702   // Special cases for instructions that can always be outlined, but will fail
9703   // the later tests, e.g., ADRPs, which are PC-relative and use LR, but can
9704   // always be outlined because they don't require a *specific* value to be in LR.
9705 if (MI.getOpcode() == AArch64::ADRP)
9706 return outliner::InstrType::Legal;
9707
9708 // If MI is a call we might be able to outline it. We don't want to outline
9709 // any calls that rely on the position of items on the stack. When we outline
9710 // something containing a call, we have to emit a save and restore of LR in
9711 // the outlined function. Currently, this always happens by saving LR to the
9712 // stack. Thus, if we outline, say, half the parameters for a function call
9713 // plus the call, then we'll break the callee's expectations for the layout
9714 // of the stack.
9715 //
9716 // FIXME: Allow calls to functions which construct a stack frame, as long
9717 // as they don't access arguments on the stack.
9718 // FIXME: Figure out some way to analyze functions defined in other modules.
9719 // We should be able to compute the memory usage based on the IR calling
9720 // convention, even if we can't see the definition.
9721 if (MI.isCall()) {
9722 // Get the function associated with the call. Look at each operand and find
9723 // the one that represents the callee and get its name.
9724 const Function *Callee = nullptr;
9725 for (const MachineOperand &MOP : MI.operands()) {
9726 if (MOP.isGlobal()) {
9727 Callee = dyn_cast<Function>(MOP.getGlobal());
9728 break;
9729 }
9730 }
9731
9732 // Never outline calls to mcount. There isn't any rule that would require
9733 // this, but the Linux kernel's "ftrace" feature depends on it.
9734 if (Callee && Callee->getName() == "\01_mcount")
9735 return outliner::InstrType::Illegal;
9736
9737 // If we don't know anything about the callee, assume it depends on the
9738 // stack layout of the caller. In that case, it's only legal to outline
9739 // as a tail-call. Explicitly list the call instructions we know about so we
9740 // don't get unexpected results with call pseudo-instructions.
9741 auto UnknownCallOutlineType = outliner::InstrType::Illegal;
9742 if (MI.getOpcode() == AArch64::BLR ||
9743 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
9744 UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
9745
9746 if (!Callee)
9747 return UnknownCallOutlineType;
9748
9749     // We have a function we have information about. Check if it's something
9750     // we can safely outline.
9751 MachineFunction *CalleeMF = MMI.getMachineFunction(*Callee);
9752
9753 // We don't know what's going on with the callee at all. Don't touch it.
9754 if (!CalleeMF)
9755 return UnknownCallOutlineType;
9756
9757 // Check if we know anything about the callee saves on the function. If we
9758 // don't, then don't touch it, since that implies that we haven't
9759 // computed anything about its stack frame yet.
9760 MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
9761 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
9762 MFI.getNumObjects() > 0)
9763 return UnknownCallOutlineType;
9764
9765 // At this point, we can say that CalleeMF ought to not pass anything on the
9766 // stack. Therefore, we can outline it.
9767 return outliner::InstrType::Legal;
9768 }
9769
9770 // Don't touch the link register or W30.
9771 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
9772 MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
9773 return outliner::InstrType::Illegal;
9774
9775 // Don't outline BTI instructions, because that will prevent the outlining
9776 // site from being indirectly callable.
9777 if (hasBTISemantics(MI))
9778 return outliner::InstrType::Illegal;
9779
9780 return outliner::InstrType::Legal;
9781 }
9782
9783 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
9784 for (MachineInstr &MI : MBB) {
9785 const MachineOperand *Base;
9786 TypeSize Width(0, false);
9787 int64_t Offset;
9788 bool OffsetIsScalable;
9789
9790 // Is this a load or store with an immediate offset with SP as the base?
9791 if (!MI.mayLoadOrStore() ||
9792 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
9793 &RI) ||
9794 (Base->isReg() && Base->getReg() != AArch64::SP))
9795 continue;
9796
9797 // It is, so we have to fix it up.
9798 TypeSize Scale(0U, false);
9799 int64_t Dummy1, Dummy2;
9800
9801 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
9802 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
9803 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
9804 assert(Scale != 0 && "Unexpected opcode!");
9805 assert(!OffsetIsScalable && "Expected offset to be a byte offset");
9806
9807 // We've pushed the return address to the stack, so add 16 to the offset.
9808 // This is safe, since we already checked if it would overflow when we
9809 // checked if this instruction was legal to outline.
9810 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
9811 StackOffsetOperand.setImm(NewImm);
9812 }
9813 }
9814
9815 static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
9816 const AArch64InstrInfo *TII,
9817 bool ShouldSignReturnAddr) {
9818 if (!ShouldSignReturnAddr)
9819 return;
9820
9821 BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE))
9822 .setMIFlag(MachineInstr::FrameSetup);
9823 BuildMI(MBB, MBB.getFirstInstrTerminator(), DebugLoc(),
9824 TII->get(AArch64::PAUTH_EPILOGUE))
9825 .setMIFlag(MachineInstr::FrameDestroy);
9826 }
9827
9828 void AArch64InstrInfo::buildOutlinedFrame(
9829 MachineBasicBlock &MBB, MachineFunction &MF,
9830 const outliner::OutlinedFunction &OF) const {
9831
9832 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
9833
9834 if (OF.FrameConstructionID == MachineOutlinerTailCall)
9835 FI->setOutliningStyle("Tail Call");
9836 else if (OF.FrameConstructionID == MachineOutlinerThunk) {
9837 // For thunk outlining, rewrite the last instruction from a call to a
9838 // tail-call.
9839 MachineInstr *Call = &*--MBB.instr_end();
9840 unsigned TailOpcode;
9841 if (Call->getOpcode() == AArch64::BL) {
9842 TailOpcode = AArch64::TCRETURNdi;
9843 } else {
9844 assert(Call->getOpcode() == AArch64::BLR ||
9845 Call->getOpcode() == AArch64::BLRNoIP);
9846 TailOpcode = AArch64::TCRETURNriALL;
9847 }
9848 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
9849 .add(Call->getOperand(0))
9850 .addImm(0);
9851 MBB.insert(MBB.end(), TC);
9852 Call->eraseFromParent();
9853
9854 FI->setOutliningStyle("Thunk");
9855 }
9856
9857 bool IsLeafFunction = true;
9858
9859 // Is there a call in the outlined range?
9860 auto IsNonTailCall = [](const MachineInstr &MI) {
9861 return MI.isCall() && !MI.isReturn();
9862 };
9863
9864 if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
9865 // Fix up the instructions in the range, since we're going to modify the
9866 // stack.
9867
9868 // Bugzilla ID: 46767
9869 // TODO: Check if fixing up twice is safe so we can outline these.
9870 assert(OF.FrameConstructionID != MachineOutlinerDefault &&
9871 "Can only fix up stack references once");
9872 fixupPostOutline(MBB);
9873
9874 IsLeafFunction = false;
9875
9876 // LR has to be a live in so that we can save it.
9877 if (!MBB.isLiveIn(AArch64::LR))
9878 MBB.addLiveIn(AArch64::LR);
9879
9880 MachineBasicBlock::iterator It = MBB.begin();
9881 MachineBasicBlock::iterator Et = MBB.end();
9882
9883 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9884 OF.FrameConstructionID == MachineOutlinerThunk)
9885 Et = std::prev(MBB.end());
9886
9887 // Insert a save before the outlined region
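    // (STRXpre below emits, roughly: str x30, [sp, #-16]!)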
9888 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
9889 .addReg(AArch64::SP, RegState::Define)
9890 .addReg(AArch64::LR)
9891 .addReg(AArch64::SP)
9892 .addImm(-16);
9893 It = MBB.insert(It, STRXpre);
9894
9895 if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
9896 CFIInstBuilder CFIBuilder(MBB, It, MachineInstr::FrameSetup);
9897
9898 // Add a CFI saying the stack was moved 16 B down.
9899 CFIBuilder.buildDefCFAOffset(16);
9900
9901 // Add a CFI saying that the LR that we want to find is now 16 B higher
9902 // than before.
9903 CFIBuilder.buildOffset(AArch64::LR, -16);
9904 }
9905
9906 // Insert a restore before the terminator for the function.
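    // (LDRXpost below emits, roughly: ldr x30, [sp], #16)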
9907 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
9908 .addReg(AArch64::SP, RegState::Define)
9909 .addReg(AArch64::LR, RegState::Define)
9910 .addReg(AArch64::SP)
9911 .addImm(16);
9912 Et = MBB.insert(Et, LDRXpost);
9913 }
9914
9915 bool ShouldSignReturnAddr = FI->shouldSignReturnAddress(!IsLeafFunction);
9916
9917 // If this is a tail call outlined function, then there's already a return.
9918 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9919 OF.FrameConstructionID == MachineOutlinerThunk) {
9920 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
9921 return;
9922 }
9923
9924 // It's not a tail call, so we have to insert the return ourselves.
9925
9926 // LR has to be a live in so that we can return to it.
9927 if (!MBB.isLiveIn(AArch64::LR))
9928 MBB.addLiveIn(AArch64::LR);
9929
9930 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
9931 .addReg(AArch64::LR);
9932 MBB.insert(MBB.end(), ret);
9933
9934 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
9935
9936 FI->setOutliningStyle("Function");
9937
9938 // Did we have to modify the stack by saving the link register?
9939 if (OF.FrameConstructionID != MachineOutlinerDefault)
9940 return;
9941
9942 // We modified the stack.
9943 // Walk over the basic block and fix up all the stack accesses.
9944 fixupPostOutline(MBB);
9945 }
9946
9947 MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
9948 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
9949 MachineFunction &MF, outliner::Candidate &C) const {
9950
9951 // Are we tail calling?
9952 if (C.CallConstructionID == MachineOutlinerTailCall) {
9953 // If yes, then we can just branch to the label.
9954 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
9955 .addGlobalAddress(M.getNamedValue(MF.getName()))
9956 .addImm(0));
9957 return It;
9958 }
9959
9960 // Are we saving the link register?
9961 if (C.CallConstructionID == MachineOutlinerNoLRSave ||
9962 C.CallConstructionID == MachineOutlinerThunk) {
9963 // No, so just insert the call.
9964 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
9965 .addGlobalAddress(M.getNamedValue(MF.getName())));
9966 return It;
9967 }
9968
9969 // We want to return the spot where we inserted the call.
9970 MachineBasicBlock::iterator CallPt;
9971
9972 // Instructions for saving and restoring LR around the call instruction we're
9973 // going to insert.
9974 MachineInstr *Save;
9975 MachineInstr *Restore;
9976 // Can we save to a register?
9977 if (C.CallConstructionID == MachineOutlinerRegSave) {
9978 // FIXME: This logic should be sunk into a target-specific interface so that
9979 // we don't have to recompute the register.
9980 Register Reg = findRegisterToSaveLRTo(C);
9981 assert(Reg && "No callee-saved register available?");
9982
9983 // LR has to be a live in so that we can save it.
9984 if (!MBB.isLiveIn(AArch64::LR))
9985 MBB.addLiveIn(AArch64::LR);
9986
9987 // Save and restore LR from Reg.
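    // (ORRXrs with XZR is the canonical register move, so this is roughly
    // "mov Reg, x30" before the call and "mov x30, Reg" after it.)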
9988 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
9989 .addReg(AArch64::XZR)
9990 .addReg(AArch64::LR)
9991 .addImm(0);
9992 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
9993 .addReg(AArch64::XZR)
9994 .addReg(Reg)
9995 .addImm(0);
9996 } else {
9997 // We have the default case. Save and restore from SP.
9998 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
9999 .addReg(AArch64::SP, RegState::Define)
10000 .addReg(AArch64::LR)
10001 .addReg(AArch64::SP)
10002 .addImm(-16);
10003 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
10004 .addReg(AArch64::SP, RegState::Define)
10005 .addReg(AArch64::LR, RegState::Define)
10006 .addReg(AArch64::SP)
10007 .addImm(16);
10008 }
10009
10010 It = MBB.insert(It, Save);
10011 It++;
10012
10013 // Insert the call.
10014 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
10015 .addGlobalAddress(M.getNamedValue(MF.getName())));
10016 CallPt = It;
10017 It++;
10018
10019 It = MBB.insert(It, Restore);
10020 return CallPt;
10021 }
10022
10023 bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
10024 MachineFunction &MF) const {
10025 return MF.getFunction().hasMinSize();
10026 }
10027
10028 void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
10029 MachineBasicBlock::iterator Iter,
10030 DebugLoc &DL,
10031 bool AllowSideEffects) const {
10032 const MachineFunction &MF = *MBB.getParent();
10033 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
10034 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
10035
10036 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
10037 BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0);
10038 } else if (STI.isSVEorStreamingSVEAvailable()) {
10039 BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg)
10040 .addImm(0)
10041 .addImm(0);
10042 } else if (STI.isNeonAvailable()) {
10043 BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg)
10044 .addImm(0);
10045 } else {
10046     // This is a streaming-compatible function without SVE. We don't have full
10047     // Neon (just FPRs), so we can at most use the first 64-bit sub-register.
10048     // Since `movi v..` would be illegal here, use `fmov d..` instead.
10049 assert(STI.hasNEON() && "Expected to have NEON.");
10050 Register Reg64 = TRI.getSubReg(Reg, AArch64::dsub);
10051 BuildMI(MBB, Iter, DL, get(AArch64::FMOVD0), Reg64);
10052 }
10053 }
10054
10055 std::optional<DestSourcePair>
10056 AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
10057
10058   // AArch64::ORRWrs and AArch64::ORRXrs with a WZR/XZR source register and a
10059   // zero shift immediate are used as aliases for the mov instruction.
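  // For example, "orr w0, wzr, w1" (shift #0) is the alias form of
  // "mov w0, w1".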
10060 if (((MI.getOpcode() == AArch64::ORRWrs &&
10061 MI.getOperand(1).getReg() == AArch64::WZR &&
10062 MI.getOperand(3).getImm() == 0x0) ||
10063 (MI.getOpcode() == AArch64::ORRWrr &&
10064 MI.getOperand(1).getReg() == AArch64::WZR)) &&
10065 // Check that the w->w move is not a zero-extending w->x mov.
10066 (!MI.getOperand(0).getReg().isVirtual() ||
10067 MI.getOperand(0).getSubReg() == 0) &&
10068 (!MI.getOperand(0).getReg().isPhysical() ||
10069 MI.findRegisterDefOperandIdx(getXRegFromWReg(MI.getOperand(0).getReg()),
10070 /*TRI=*/nullptr) == -1))
10071 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
10072
10073 if (MI.getOpcode() == AArch64::ORRXrs &&
10074 MI.getOperand(1).getReg() == AArch64::XZR &&
10075 MI.getOperand(3).getImm() == 0x0)
10076 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
10077
10078 return std::nullopt;
10079 }
10080
10081 std::optional<DestSourcePair>
10082 AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const {
10083 if ((MI.getOpcode() == AArch64::ORRWrs &&
10084 MI.getOperand(1).getReg() == AArch64::WZR &&
10085 MI.getOperand(3).getImm() == 0x0) ||
10086 (MI.getOpcode() == AArch64::ORRWrr &&
10087 MI.getOperand(1).getReg() == AArch64::WZR))
10088 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
10089 return std::nullopt;
10090 }
10091
10092 std::optional<RegImmPair>
10093 AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
10094 int Sign = 1;
10095 int64_t Offset = 0;
10096
10097 // TODO: Handle cases where Reg is a super- or sub-register of the
10098 // destination register.
10099 const MachineOperand &Op0 = MI.getOperand(0);
10100 if (!Op0.isReg() || Reg != Op0.getReg())
10101 return std::nullopt;
10102
10103 switch (MI.getOpcode()) {
10104 default:
10105 return std::nullopt;
10106 case AArch64::SUBWri:
10107 case AArch64::SUBXri:
10108 case AArch64::SUBSWri:
10109 case AArch64::SUBSXri:
10110 Sign *= -1;
10111 [[fallthrough]];
10112 case AArch64::ADDSWri:
10113 case AArch64::ADDSXri:
10114 case AArch64::ADDWri:
10115 case AArch64::ADDXri: {
10116     // TODO: The third operand can be a global address (usually some string).
10117 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
10118 !MI.getOperand(2).isImm())
10119 return std::nullopt;
10120 int Shift = MI.getOperand(3).getImm();
10121 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
10122 Offset = Sign * (MI.getOperand(2).getImm() << Shift);
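    // For example (illustrative): querying "sub x0, x1, #4, lsl #12" with
    // Reg == x0 yields RegImmPair{x1, -16384}, since Sign == -1 and
    // 4 << 12 == 16384.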
10123 }
10124 }
10125 return RegImmPair{MI.getOperand(1).getReg(), Offset};
10126 }
10127
10128 /// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
10129 /// the destination register then, if possible, describe the value in terms of
10130 /// the source register.
10131 static std::optional<ParamLoadedValue>
10132 describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
10133 const TargetInstrInfo *TII,
10134 const TargetRegisterInfo *TRI) {
10135 auto DestSrc = TII->isCopyLikeInstr(MI);
10136 if (!DestSrc)
10137 return std::nullopt;
10138
10139 Register DestReg = DestSrc->Destination->getReg();
10140 Register SrcReg = DestSrc->Source->getReg();
10141
10142 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
10143
10144 // If the described register is the destination, just return the source.
10145 if (DestReg == DescribedReg)
10146 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
10147
10148 // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
10149 if (MI.getOpcode() == AArch64::ORRWrs &&
10150 TRI->isSuperRegister(DestReg, DescribedReg))
10151 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
10152
10153   // We may need to describe the lower part of an ORRXrs move.
10154 if (MI.getOpcode() == AArch64::ORRXrs &&
10155 TRI->isSubRegister(DestReg, DescribedReg)) {
10156 Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
10157 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
10158 }
10159
10160 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
10161 "Unhandled ORR[XW]rs copy case");
10162
10163 return std::nullopt;
10164 }
10165
10166 bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
10167 // Functions cannot be split to different sections on AArch64 if they have
10168 // a red zone. This is because relaxing a cross-section branch may require
10169 // incrementing the stack pointer to spill a register, which would overwrite
10170 // the red zone.
10171 if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true))
10172 return false;
10173
10174 return TargetInstrInfo::isFunctionSafeToSplit(MF);
10175 }
10176
10177 bool AArch64InstrInfo::isMBBSafeToSplitToCold(
10178 const MachineBasicBlock &MBB) const {
10179 // Asm Goto blocks can contain conditional branches to goto labels, which can
10180 // get moved out of range of the branch instruction.
10181 auto isAsmGoto = [](const MachineInstr &MI) {
10182 return MI.getOpcode() == AArch64::INLINEASM_BR;
10183 };
10184 if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
10185 return false;
10186
10187 // Because jump tables are label-relative instead of table-relative, they all
10188 // must be in the same section or relocation fixup handling will fail.
10189
10190 // Check if MBB is a jump table target
10191 const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
10192 auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
10193 return llvm::is_contained(JTE.MBBs, &MBB);
10194 };
10195 if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB))
10196 return false;
10197
10198 // Check if MBB contains a jump table lookup
10199 for (const MachineInstr &MI : MBB) {
10200 switch (MI.getOpcode()) {
10201 case TargetOpcode::G_BRJT:
10202 case AArch64::JumpTableDest32:
10203 case AArch64::JumpTableDest16:
10204 case AArch64::JumpTableDest8:
10205 return false;
10206 default:
10207 continue;
10208 }
10209 }
10210
10211 // MBB isn't a special case, so it's safe to be split to the cold section.
10212 return true;
10213 }
10214
10215 std::optional<ParamLoadedValue>
10216 AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
10217 Register Reg) const {
10218 const MachineFunction *MF = MI.getMF();
10219 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
10220 switch (MI.getOpcode()) {
10221 case AArch64::MOVZWi:
10222 case AArch64::MOVZXi: {
10223 // MOVZWi may be used for producing zero-extended 32-bit immediates in
10224 // 64-bit parameters, so we need to consider super-registers.
10225 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
10226 return std::nullopt;
10227
10228 if (!MI.getOperand(1).isImm())
10229 return std::nullopt;
10230 int64_t Immediate = MI.getOperand(1).getImm();
10231 int Shift = MI.getOperand(2).getImm();
10232 return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
10233 nullptr);
10234 }
10235 case AArch64::ORRWrs:
10236 case AArch64::ORRXrs:
10237 return describeORRLoadedValue(MI, Reg, this, TRI);
10238 }
10239
10240 return TargetInstrInfo::describeLoadedValue(MI, Reg);
10241 }
10242
10243 bool AArch64InstrInfo::isExtendLikelyToBeFolded(
10244 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
10245 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
10246 ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
10247 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
10248
10249 // Anyexts are nops.
10250 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
10251 return true;
10252
10253 Register DefReg = ExtMI.getOperand(0).getReg();
10254 if (!MRI.hasOneNonDBGUse(DefReg))
10255 return false;
10256
10257 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
10258 // addressing mode.
10259 auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
10260 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
10261 }
10262
10263 uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
10264 return get(Opc).TSFlags & AArch64::ElementSizeMask;
10265 }
10266
10267 bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
10268 return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
10269 }
10270
10271 bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
10272 return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
10273 }
10274
10275 unsigned int
10276 AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
10277 return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
10278 }
10279
10280 bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
10281 unsigned Scale) const {
10282 if (Offset && Scale)
10283 return false;
10284
10285 // Check Reg + Imm
10286 if (!Scale) {
10287 // 9-bit signed offset
10288 if (isInt<9>(Offset))
10289 return true;
10290
10291 // 12-bit unsigned offset
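    // For example, with NumBytes == 8 this accepts positive byte offsets that
    // are multiples of 8, up to 4095 * 8 == 32760; offset 0 is already covered
    // by the 9-bit signed check above.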
10292 unsigned Shift = Log2_64(NumBytes);
10293 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
10294 // Must be a multiple of NumBytes (NumBytes is a power of 2)
10295 (Offset >> Shift) << Shift == Offset)
10296 return true;
10297 return false;
10298 }
10299
10300 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
10301 return Scale == 1 || (Scale > 0 && Scale == NumBytes);
10302 }
10303
10304 unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
10305 if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
10306 return AArch64::BLRNoIP;
10307 else
10308 return AArch64::BLR;
10309 }
10310
10311 MachineBasicBlock::iterator
10312 AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
10313 Register TargetReg, bool FrameSetup) const {
10314 assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
10315
10316 MachineBasicBlock &MBB = *MBBI->getParent();
10317 MachineFunction &MF = *MBB.getParent();
10318 const AArch64InstrInfo *TII =
10319 MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
10320 int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
10321 DebugLoc DL = MBB.findDebugLoc(MBBI);
10322
10323 MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
10324 MachineBasicBlock *LoopTestMBB =
10325 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
10326 MF.insert(MBBInsertPoint, LoopTestMBB);
10327 MachineBasicBlock *LoopBodyMBB =
10328 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
10329 MF.insert(MBBInsertPoint, LoopBodyMBB);
10330 MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
10331 MF.insert(MBBInsertPoint, ExitMBB);
10332 MachineInstr::MIFlag Flags =
10333 FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;
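  // The emitted probe loop is roughly:
  //   LoopTest:
  //     sub  sp, sp, #ProbeSize
  //     cmp  sp, TargetReg
  //     b.le LoopExit
  //   LoopBody:
  //     str  xzr, [sp]
  //     b    LoopTest
  //   LoopExit:
  //     mov  sp, TargetReg
  //     ldr  xzr, [sp]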
10334
10335 // LoopTest:
10336 // SUB SP, SP, #ProbeSize
10337 emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
10338 AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);
10339
10340 // CMP SP, TargetReg
10341 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
10342 AArch64::XZR)
10343 .addReg(AArch64::SP)
10344 .addReg(TargetReg)
10345 .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
10346 .setMIFlags(Flags);
10347
10348 // B.<Cond> LoopExit
10349 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
10350 .addImm(AArch64CC::LE)
10351 .addMBB(ExitMBB)
10352 .setMIFlags(Flags);
10353
10354 // STR XZR, [SP]
10355 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
10356 .addReg(AArch64::XZR)
10357 .addReg(AArch64::SP)
10358 .addImm(0)
10359 .setMIFlags(Flags);
10360
10361 // B loop
10362 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
10363 .addMBB(LoopTestMBB)
10364 .setMIFlags(Flags);
10365
10366 // LoopExit:
10367 // MOV SP, TargetReg
10368 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
10369 .addReg(TargetReg)
10370 .addImm(0)
10371 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
10372 .setMIFlags(Flags);
10373
10374 // LDR XZR, [SP]
10375 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))
10376 .addReg(AArch64::XZR, RegState::Define)
10377 .addReg(AArch64::SP)
10378 .addImm(0)
10379 .setMIFlags(Flags);
10380
10381 ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
10382 ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
10383
10384 LoopTestMBB->addSuccessor(ExitMBB);
10385 LoopTestMBB->addSuccessor(LoopBodyMBB);
10386 LoopBodyMBB->addSuccessor(LoopTestMBB);
10387 MBB.addSuccessor(LoopTestMBB);
10388
10389 // Update liveins.
10390 if (MF.getRegInfo().reservedRegsFrozen())
10391 fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB});
10392
10393 return ExitMBB->begin();
10394 }
10395
10396 namespace {
10397 class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
10398 MachineFunction *MF;
10399 const TargetInstrInfo *TII;
10400 const TargetRegisterInfo *TRI;
10401 MachineRegisterInfo &MRI;
10402
10403 /// The block of the loop
10404 MachineBasicBlock *LoopBB;
10405 /// The conditional branch of the loop
10406 MachineInstr *CondBranch;
10407 /// The compare instruction for loop control
10408 MachineInstr *Comp;
10409 /// The number of the operand of the loop counter value in Comp
10410 unsigned CompCounterOprNum;
10411 /// The instruction that updates the loop counter value
10412 MachineInstr *Update;
10413 /// The number of the operand of the loop counter value in Update
10414 unsigned UpdateCounterOprNum;
10415 /// The initial value of the loop counter
10416 Register Init;
10417 /// True iff Update is a predecessor of Comp
10418 bool IsUpdatePriorComp;
10419
10420 /// The normalized condition used by createTripCountGreaterCondition()
10421 SmallVector<MachineOperand, 4> Cond;
10422
10423 public:
10424   AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
10425 MachineInstr *Comp, unsigned CompCounterOprNum,
10426 MachineInstr *Update, unsigned UpdateCounterOprNum,
10427 Register Init, bool IsUpdatePriorComp,
10428 const SmallVectorImpl<MachineOperand> &Cond)
10429 : MF(Comp->getParent()->getParent()),
10430 TII(MF->getSubtarget().getInstrInfo()),
10431 TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
10432 LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
10433 CompCounterOprNum(CompCounterOprNum), Update(Update),
10434 UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
10435 IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}
10436
10437   bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
10438 // Make the instructions for loop control be placed in stage 0.
10439 // The predecessors of Comp are considered by the caller.
10440 return MI == Comp;
10441 }
10442
10443   std::optional<bool> createTripCountGreaterCondition(
10444 int TC, MachineBasicBlock &MBB,
10445 SmallVectorImpl<MachineOperand> &CondParam) override {
10446 // A branch instruction will be inserted as "if (Cond) goto epilogue".
10447 // Cond is normalized for such use.
10448 // The predecessors of the branch are assumed to have already been inserted.
10449 CondParam = Cond;
10450 return {};
10451 }
10452
10453 void createRemainingIterationsGreaterCondition(
10454 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
10455 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;
10456
10457   void setPreheader(MachineBasicBlock *NewPreheader) override {}
10458
10459   void adjustTripCount(int TripCountAdjust) override {}
10460
10461   bool isMVEExpanderSupported() override { return true; }
10462 };
10463 } // namespace
10464
10465 /// Clone an instruction from MI. The register of the ReplaceOprNum-th
10466 /// operand is replaced by ReplaceReg. The output register is newly created.
10467 /// The other operands are unchanged from MI.
10468 static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
10469 Register ReplaceReg, MachineBasicBlock &MBB,
10470 MachineBasicBlock::iterator InsertTo) {
10471 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
10472 const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
10473 const TargetRegisterInfo *TRI =
10474 MBB.getParent()->getSubtarget().getRegisterInfo();
10475 MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);
10476 Register Result = 0;
10477 for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
10478 if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) {
10479 Result = MRI.createVirtualRegister(
10480 MRI.getRegClass(NewMI->getOperand(0).getReg()));
10481 NewMI->getOperand(I).setReg(Result);
10482 } else if (I == ReplaceOprNum) {
10483 MRI.constrainRegClass(
10484 ReplaceReg,
10485 TII->getRegClass(NewMI->getDesc(), I, TRI, *MBB.getParent()));
10486 NewMI->getOperand(I).setReg(ReplaceReg);
10487 }
10488 }
10489 MBB.insert(InsertTo, NewMI);
10490 return Result;
10491 }
10492
10493 void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
10494 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
10495 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) {
10496   // Create and accumulate conditions for the next TC iterations.
10497 // Example:
10498 // SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
10499 // # iteration of the kernel
10500 //
10501 // # insert the following instructions
10502 // cond = CSINCXr 0, 0, C, implicit $nzcv
10503 // counter = ADDXri counter, 1 # clone from this->Update
10504 // SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
10505 // cond = CSINCXr cond, cond, C, implicit $nzcv
10506 // ... (repeat TC times)
10507 // SUBSXri cond, 0, implicit-def $nzcv
10508
10509 assert(CondBranch->getOpcode() == AArch64::Bcc);
10510 // CondCode to exit the loop
10511 AArch64CC::CondCode CC =
10512 (AArch64CC::CondCode)CondBranch->getOperand(0).getImm();
10513 if (CondBranch->getOperand(1).getMBB() == LoopBB)
10514 CC = AArch64CC::getInvertedCondCode(CC);
10515
10516 // Accumulate conditions to exit the loop
10517 Register AccCond = AArch64::XZR;
10518
10519 // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
10520 auto AccumulateCond = [&](Register CurCond,
10521 AArch64CC::CondCode CC) -> Register {
10522 Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
10523 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr))
10524 .addReg(NewCond, RegState::Define)
10525 .addReg(CurCond)
10526 .addReg(CurCond)
10527 .addImm(AArch64CC::getInvertedCondCode(CC));
10528 return NewCond;
10529 };
10530
10531 if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
10532     // Update and Comp for I == 0 already exist in MBB
10533     // (MBB is an unrolled kernel).
10534 Register Counter;
    for (int I = 0; I <= TC; ++I) {
      Register NextCounter;
      if (I != 0)
        NextCounter =
            cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());

      AccCond = AccumulateCond(AccCond, CC);

      if (I != TC) {
        if (I == 0) {
          if (Update != Comp && IsUpdatePriorComp) {
            Counter =
                LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
            NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB,
                                     MBB.end());
          } else {
            // We can reuse the value already calculated in the kernel.
            NextCounter = LastStage0Insts[Update]->getOperand(0).getReg();
          }
        } else if (Update != Comp) {
          NextCounter =
              cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
        }
      }
      Counter = NextCounter;
    }
  } else {
    Register Counter;
    if (LastStage0Insts.empty()) {
      // Use the initial counter value: we are testing whether the trip count
      // is sufficient for the pipelined code to execute at all.
      Counter = Init;
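      // If the update precedes the compare in the loop body, the first
      // compare observes an already-updated counter, so apply one update to
      // the initial value.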
      if (IsUpdatePriorComp)
        Counter =
            cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
    } else {
      // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
      Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
    }

    for (int I = 0; I <= TC; ++I) {
      Register NextCounter =
          cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
      AccCond = AccumulateCond(AccCond, CC);
      if (I != TC && Update != Comp)
        NextCounter =
            cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
      Counter = NextCounter;
    }
  }

  // If AccCond == 0, the remainder is greater than TC.
  BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri))
      .addReg(AArch64::XZR, RegState::Define | RegState::Dead)
      .addReg(AccCond)
      .addImm(0)
      .addImm(0);
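  // EQ on this compare holds exactly when none of the probed iterations
  // signaled a loop exit, i.e. more than TC iterations remain.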
  Cond.clear();
  Cond.push_back(MachineOperand::CreateImm(AArch64CC::EQ));
}

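/// Extract from \p Phi the register incoming from \p MBB into \p RegMBB and
/// the register incoming from the other predecessor into \p RegOther.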
static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
                          Register &RegMBB, Register &RegOther) {
  assert(Phi.getNumOperands() == 5);
  if (Phi.getOperand(2).getMBB() == MBB) {
    RegMBB = Phi.getOperand(1).getReg();
    RegOther = Phi.getOperand(3).getReg();
  } else {
    assert(Phi.getOperand(4).getMBB() == MBB);
    RegMBB = Phi.getOperand(3).getReg();
    RegOther = Phi.getOperand(1).getReg();
  }
}

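/// Return true if \p Reg is a virtual register whose defining instruction
/// lies outside \p BB. Physical registers conservatively return false.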
static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) {
  if (!Reg.isVirtual())
    return false;
  const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
  return MRI.getVRegDef(Reg)->getParent() != BB;
}

/// If \p Reg is an induction variable, return true and set \p UpdateInst to
/// the instruction that updates it, \p UpdateCounterOprNum to the index of
/// the counter operand in that instruction, \p InitReg to the initial value,
/// and \p IsUpdatePriorComp to whether the update precedes the use of
/// \p Reg (i.e. whether \p Reg holds the already-updated value).
static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
                          MachineInstr *&UpdateInst,
                          unsigned &UpdateCounterOprNum, Register &InitReg,
                          bool &IsUpdatePriorComp) {
  // Example:
  //
  // Preheader:
  //   InitReg = ...
  // LoopBB:
  //   Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
  //   Reg = COPY Reg0   ; COPY is ignored.
  //   Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
  //                     ; Reg is the value calculated in the previous
  //                     ; iteration, so IsUpdatePriorComp == false.

  if (LoopBB->pred_size() != 2)
    return false;
  if (!Reg.isVirtual())
    return false;
  const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
  UpdateInst = nullptr;
  UpdateCounterOprNum = 0;
  InitReg = 0;
  IsUpdatePriorComp = true;
  Register CurReg = Reg;
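  // Walk the definition chain from Reg through COPYs and the loop-header PHI
  // until it closes back on Reg, recording the single update instruction
  // encountered on the way.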
  while (true) {
    MachineInstr *Def = MRI.getVRegDef(CurReg);
    if (Def->getParent() != LoopBB)
      return false;
    if (Def->isCopy()) {
      // Ignore copy instructions unless they contain subregisters.
      if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
        return false;
      CurReg = Def->getOperand(1).getReg();
    } else if (Def->isPHI()) {
      if (InitReg != 0)
        return false;
      if (!UpdateInst)
        IsUpdatePriorComp = false;
      extractPhiReg(*Def, LoopBB, CurReg, InitReg);
    } else {
      if (UpdateInst)
        return false;
      switch (Def->getOpcode()) {
      case AArch64::ADDSXri:
      case AArch64::ADDSWri:
      case AArch64::SUBSXri:
      case AArch64::SUBSWri:
      case AArch64::ADDXri:
      case AArch64::ADDWri:
      case AArch64::SUBXri:
      case AArch64::SUBWri:
        UpdateInst = Def;
        UpdateCounterOprNum = 1;
        break;
      case AArch64::ADDSXrr:
      case AArch64::ADDSWrr:
      case AArch64::SUBSXrr:
      case AArch64::SUBSWrr:
      case AArch64::ADDXrr:
      case AArch64::ADDWrr:
      case AArch64::SUBXrr:
      case AArch64::SUBWrr:
        UpdateInst = Def;
        if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB))
          UpdateCounterOprNum = 1;
        else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB))
          UpdateCounterOprNum = 2;
        else
          return false;
        break;
      default:
        return false;
      }
      CurReg = Def->getOperand(UpdateCounterOprNum).getReg();
    }

    if (!CurReg.isVirtual())
      return false;
    if (Reg == CurReg)
      break;
  }

  if (!UpdateInst)
    return false;

  return true;
}

std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
  // Accept loops that meet the following conditions:
  // * The conditional branch is BCC.
  // * The compare instruction is ADDS/SUBS/WHILEXX.
  // * One operand of the compare is an induction variable and the other is a
  //   loop invariant value.
  // * The induction variable is incremented/decremented by a single
  //   instruction.
  // * The loop does not contain calls or instructions with unmodeled side
  //   effects.

  for (MachineInstr &MI : *LoopBB)
    if (MI.isCall() || MI.hasUnmodeledSideEffects())
      // This instruction may use NZCV, which interferes with the instruction
      // to be inserted for loop control.
      return nullptr;

  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
    return nullptr;

  // Infinite loops are not supported.
  if (TBB == LoopBB && FBB == LoopBB)
    return nullptr;

  // Must be a conditional branch.
  if (TBB != LoopBB && FBB == nullptr)
    return nullptr;

  assert((TBB == LoopBB || FBB == LoopBB) &&
         "The Loop must be a single-basic-block loop");

  MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
  const TargetRegisterInfo &TRI = getRegisterInfo();

  if (CondBranch->getOpcode() != AArch64::Bcc)
    return nullptr;

  // Normalization for createTripCountGreaterCondition().
  if (TBB == LoopBB)
    reverseBranchCondition(Cond);

  MachineInstr *Comp = nullptr;
  unsigned CompCounterOprNum = 0;
  for (MachineInstr &MI : reverse(*LoopBB)) {
    if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
      // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
      // operands is a loop invariant value.

      switch (MI.getOpcode()) {
      case AArch64::SUBSXri:
      case AArch64::SUBSWri:
      case AArch64::ADDSXri:
      case AArch64::ADDSWri:
        Comp = &MI;
        CompCounterOprNum = 1;
        break;
      case AArch64::ADDSWrr:
      case AArch64::ADDSXrr:
      case AArch64::SUBSWrr:
      case AArch64::SUBSXrr:
        Comp = &MI;
        break;
      default:
        if (isWhileOpcode(MI.getOpcode())) {
          Comp = &MI;
          break;
        }
        return nullptr;
      }

      if (CompCounterOprNum == 0) {
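        // For register-register compares, the counter is whichever operand is
        // not loop-invariant.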
        if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB))
          CompCounterOprNum = 2;
        else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB))
          CompCounterOprNum = 1;
        else
          return nullptr;
      }
      break;
    }
  }
  if (!Comp)
    return nullptr;

  MachineInstr *Update = nullptr;
  Register Init;
  bool IsUpdatePriorComp;
  unsigned UpdateCounterOprNum;
  if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB,
                     Update, UpdateCounterOprNum, Init, IsUpdatePriorComp))
    return nullptr;

  return std::make_unique<AArch64PipelinerLoopInfo>(
      LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum,
      Init, IsUpdatePriorComp, Cond);
}

/// verifyInstruction - Perform target specific instruction verification.
bool AArch64InstrInfo::verifyInstruction(const MachineInstr &MI,
                                         StringRef &ErrInfo) const {
  // Verify that immediate offsets on load/store instructions are within range.
  // Stack objects with an FI operand are excluded as they can be fixed up
  // during PEI.
  TypeSize Scale(0U, false), Width(0U, false);
  int64_t MinOffset, MaxOffset;
  if (getMemOpInfo(MI.getOpcode(), Scale, Width, MinOffset, MaxOffset)) {
    unsigned ImmIdx = getLoadStoreImmIdx(MI.getOpcode());
    if (MI.getOperand(ImmIdx).isImm() && !MI.getOperand(ImmIdx - 1).isFI()) {
      int64_t Imm = MI.getOperand(ImmIdx).getImm();
      if (Imm < MinOffset || Imm > MaxOffset) {
        ErrInfo = "Unexpected immediate on load/store instruction";
        return false;
      }
    }
  }
  return true;
}

#define GET_INSTRINFO_HELPERS
#define GET_INSTRMAP_INFO
#include "AArch64GenInstrInfo.inc"