1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 implementation of the TargetInstrInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "AArch64InstrInfo.h"
14 #include "AArch64ExpandImm.h"
15 #include "AArch64FrameLowering.h"
16 #include "AArch64MachineFunctionInfo.h"
17 #include "AArch64PointerAuth.h"
18 #include "AArch64Subtarget.h"
19 #include "MCTargetDesc/AArch64AddressingModes.h"
20 #include "MCTargetDesc/AArch64MCTargetDesc.h"
21 #include "Utils/AArch64BaseInfo.h"
22 #include "llvm/ADT/ArrayRef.h"
23 #include "llvm/ADT/STLExtras.h"
24 #include "llvm/ADT/SmallVector.h"
25 #include "llvm/CodeGen/LivePhysRegs.h"
26 #include "llvm/CodeGen/MachineBasicBlock.h"
27 #include "llvm/CodeGen/MachineCombinerPattern.h"
28 #include "llvm/CodeGen/MachineFrameInfo.h"
29 #include "llvm/CodeGen/MachineFunction.h"
30 #include "llvm/CodeGen/MachineInstr.h"
31 #include "llvm/CodeGen/MachineInstrBuilder.h"
32 #include "llvm/CodeGen/MachineMemOperand.h"
33 #include "llvm/CodeGen/MachineModuleInfo.h"
34 #include "llvm/CodeGen/MachineOperand.h"
35 #include "llvm/CodeGen/MachineRegisterInfo.h"
36 #include "llvm/CodeGen/RegisterScavenging.h"
37 #include "llvm/CodeGen/StackMaps.h"
38 #include "llvm/CodeGen/TargetRegisterInfo.h"
39 #include "llvm/CodeGen/TargetSubtargetInfo.h"
40 #include "llvm/IR/DebugInfoMetadata.h"
41 #include "llvm/IR/DebugLoc.h"
42 #include "llvm/IR/GlobalValue.h"
43 #include "llvm/IR/Module.h"
44 #include "llvm/MC/MCAsmInfo.h"
45 #include "llvm/MC/MCInst.h"
46 #include "llvm/MC/MCInstBuilder.h"
47 #include "llvm/MC/MCInstrDesc.h"
48 #include "llvm/Support/Casting.h"
49 #include "llvm/Support/CodeGen.h"
50 #include "llvm/Support/CommandLine.h"
51 #include "llvm/Support/ErrorHandling.h"
52 #include "llvm/Support/LEB128.h"
53 #include "llvm/Support/MathExtras.h"
54 #include "llvm/Target/TargetMachine.h"
55 #include "llvm/Target/TargetOptions.h"
56 #include <cassert>
57 #include <cstdint>
58 #include <iterator>
59 #include <utility>
60
61 using namespace llvm;
62
63 #define GET_INSTRINFO_CTOR_DTOR
64 #include "AArch64GenInstrInfo.inc"
65
66 static cl::opt<unsigned> TBZDisplacementBits(
67 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
68 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
69
70 static cl::opt<unsigned> CBZDisplacementBits(
71 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
72 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
73
74 static cl::opt<unsigned>
75 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
76 cl::desc("Restrict range of Bcc instructions (DEBUG)"));
77
78 static cl::opt<unsigned>
79 BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
80 cl::desc("Restrict range of B instructions (DEBUG)"));
81
82 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
83 : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
84 AArch64::CATCHRET),
85 RI(STI.getTargetTriple()), Subtarget(STI) {}
86
87 /// GetInstSize - Return the number of bytes of code the specified
88 /// instruction may occupy. This returns the maximum number of bytes.
89 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
90 const MachineBasicBlock &MBB = *MI.getParent();
91 const MachineFunction *MF = MBB.getParent();
92 const Function &F = MF->getFunction();
93 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
94
95 {
96 auto Op = MI.getOpcode();
97 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
98 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
99 }
100
101 // Meta-instructions emit no code.
102 if (MI.isMetaInstruction())
103 return 0;
104
105 // FIXME: We currently only handle pseudoinstructions that don't get expanded
106 // before the assembly printer.
107 unsigned NumBytes = 0;
108 const MCInstrDesc &Desc = MI.getDesc();
109
110 // The size should preferably be set in
111 // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
112 // The specific cases below handle instructions of variable size.
113 switch (Desc.getOpcode()) {
114 default:
115 if (Desc.getSize())
116 return Desc.getSize();
117
118 // Anything not explicitly designated otherwise (i.e. pseudo-instructions
119 // with fixed constant size but not specified in .td file) is a normal
120 // 4-byte insn.
121 NumBytes = 4;
122 break;
123 case TargetOpcode::STACKMAP:
124 // The upper bound for a stackmap intrinsic is the full length of its shadow
125 NumBytes = StackMapOpers(&MI).getNumPatchBytes();
126 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
127 break;
128 case TargetOpcode::PATCHPOINT:
129 // The size of the patchpoint intrinsic is the number of bytes requested
130 NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
131 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
132 break;
133 case TargetOpcode::STATEPOINT:
134 NumBytes = StatepointOpers(&MI).getNumPatchBytes();
135 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
136 // No patch bytes means a normal call inst is emitted
137 if (NumBytes == 0)
138 NumBytes = 4;
139 break;
140 case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
141 // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
142 // instructions are expanded to the specified number of NOPs. Otherwise,
143 // they are expanded to 36-byte XRay sleds.
144 NumBytes =
145 F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4;
146 break;
147 case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
148 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
149 // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
150 NumBytes = 36;
151 break;
152 case TargetOpcode::PATCHABLE_EVENT_CALL:
153 // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
154 NumBytes = 24;
155 break;
156
157 case AArch64::SPACE:
158 NumBytes = MI.getOperand(1).getImm();
159 break;
160 case TargetOpcode::BUNDLE:
161 NumBytes = getInstBundleLength(MI);
162 break;
163 }
164
165 return NumBytes;
166 }
167
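/// Return the total size in bytes of the instructions contained in the
/// bundle headed by MI.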
168 unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
169 unsigned Size = 0;
170 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
171 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
172 while (++I != E && I->isInsideBundle()) {
173 assert(!I->isBundle() && "No nested bundle!");
174 Size += getInstSizeInBytes(*I);
175 }
176 return Size;
177 }
178
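// Decode the target block and condition of a conditional branch terminator,
// pushing the condition operands onto Cond in the form analyzeBranch expects
// (a leading -1 plus the opcode for the CB[N]Z/TB[N]Z forms).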
179 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
180 SmallVectorImpl<MachineOperand> &Cond) {
181 // Block ends with fall-through condbranch.
182 switch (LastInst->getOpcode()) {
183 default:
184 llvm_unreachable("Unknown branch instruction?");
185 case AArch64::Bcc:
186 Target = LastInst->getOperand(1).getMBB();
187 Cond.push_back(LastInst->getOperand(0));
188 break;
189 case AArch64::CBZW:
190 case AArch64::CBZX:
191 case AArch64::CBNZW:
192 case AArch64::CBNZX:
193 Target = LastInst->getOperand(1).getMBB();
194 Cond.push_back(MachineOperand::CreateImm(-1));
195 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
196 Cond.push_back(LastInst->getOperand(0));
197 break;
198 case AArch64::TBZW:
199 case AArch64::TBZX:
200 case AArch64::TBNZW:
201 case AArch64::TBNZX:
202 Target = LastInst->getOperand(2).getMBB();
203 Cond.push_back(MachineOperand::CreateImm(-1));
204 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
205 Cond.push_back(LastInst->getOperand(0));
206 Cond.push_back(LastInst->getOperand(1));
207 }
208 }
209
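// Return the number of signed displacement bits available to the given branch
// opcode, taking the debug command-line overrides above into account.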
210 static unsigned getBranchDisplacementBits(unsigned Opc) {
211 switch (Opc) {
212 default:
213 llvm_unreachable("unexpected opcode!");
214 case AArch64::B:
215 return BDisplacementBits;
216 case AArch64::TBNZW:
217 case AArch64::TBZW:
218 case AArch64::TBNZX:
219 case AArch64::TBZX:
220 return TBZDisplacementBits;
221 case AArch64::CBNZW:
222 case AArch64::CBZW:
223 case AArch64::CBNZX:
224 case AArch64::CBZX:
225 return CBZDisplacementBits;
226 case AArch64::Bcc:
227 return BCCDisplacementBits;
228 }
229 }
230
231 bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
232 int64_t BrOffset) const {
233 unsigned Bits = getBranchDisplacementBits(BranchOp);
234 assert(Bits >= 3 && "max branch displacement must be enough to jump "
235 "over conditional branch expansion");
236 return isIntN(Bits, BrOffset / 4);
237 }
238
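// Return the destination block of a direct branch instruction.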
239 MachineBasicBlock *
240 AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
241 switch (MI.getOpcode()) {
242 default:
243 llvm_unreachable("unexpected opcode!");
244 case AArch64::B:
245 return MI.getOperand(0).getMBB();
246 case AArch64::TBZW:
247 case AArch64::TBNZW:
248 case AArch64::TBZX:
249 case AArch64::TBNZX:
250 return MI.getOperand(2).getMBB();
251 case AArch64::CBZW:
252 case AArch64::CBNZW:
253 case AArch64::CBZX:
254 case AArch64::CBNZX:
255 case AArch64::Bcc:
256 return MI.getOperand(1).getMBB();
257 }
258 }
259
260 void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
261 MachineBasicBlock &NewDestBB,
262 MachineBasicBlock &RestoreBB,
263 const DebugLoc &DL,
264 int64_t BrOffset,
265 RegScavenger *RS) const {
266 assert(RS && "RegScavenger required for long branching");
267 assert(MBB.empty() &&
268 "new block should be inserted for expanding unconditional branch");
269 assert(MBB.pred_size() == 1);
270 assert(RestoreBB.empty() &&
271 "restore block should be inserted for restoring clobbered registers");
272
273 auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
274 // Offsets outside of the signed 33-bit range are not supported for ADRP +
275 // ADD.
276 if (!isInt<33>(BrOffset))
277 report_fatal_error(
278 "Branch offsets outside of the signed 33-bit range not supported");
279
280 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
281 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
282 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
283 .addReg(Reg)
284 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
285 .addImm(0);
286 BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
287 };
288
289 RS->enterBasicBlockEnd(MBB);
290 // If X16 is unused, we can rely on the linker to insert a range extension
291 // thunk if NewDestBB is out of range of a single B instruction.
292 constexpr Register Reg = AArch64::X16;
293 if (!RS->isRegUsed(Reg)) {
294 insertUnconditionalBranch(MBB, &NewDestBB, DL);
295 RS->setRegUsed(Reg);
296 return;
297 }
298
299 // If there's a free register and it's worth inflating the code size,
300 // manually insert the indirect branch.
301 Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass);
302 if (Scavenged != AArch64::NoRegister &&
303 MBB.getSectionID() == MBBSectionID::ColdSectionID) {
304 buildIndirectBranch(Scavenged, NewDestBB);
305 RS->setRegUsed(Scavenged);
306 return;
307 }
308
309 // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
310 // with red zones.
311 AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
312 if (!AFI || AFI->hasRedZone().value_or(true))
313 report_fatal_error(
314 "Unable to insert indirect branch inside function that has red zone");
315
316 // Otherwise, spill X16 and defer range extension to the linker.
317 BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
318 .addReg(AArch64::SP, RegState::Define)
319 .addReg(Reg)
320 .addReg(AArch64::SP)
321 .addImm(-16);
322
323 BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);
324
325 BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
326 .addReg(AArch64::SP, RegState::Define)
327 .addReg(Reg, RegState::Define)
328 .addReg(AArch64::SP)
329 .addImm(16);
330 }
331
332 // Branch analysis.
333 bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
334 MachineBasicBlock *&TBB,
335 MachineBasicBlock *&FBB,
336 SmallVectorImpl<MachineOperand> &Cond,
337 bool AllowModify) const {
338 // If the block has no terminators, it just falls into the block after it.
339 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
340 if (I == MBB.end())
341 return false;
342
343 // Skip over SpeculationBarrierEndBB terminators
344 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
345 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
346 --I;
347 }
348
349 if (!isUnpredicatedTerminator(*I))
350 return false;
351
352 // Get the last instruction in the block.
353 MachineInstr *LastInst = &*I;
354
355 // If there is only one terminator instruction, process it.
356 unsigned LastOpc = LastInst->getOpcode();
357 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
358 if (isUncondBranchOpcode(LastOpc)) {
359 TBB = LastInst->getOperand(0).getMBB();
360 return false;
361 }
362 if (isCondBranchOpcode(LastOpc)) {
363 // Block ends with fall-through condbranch.
364 parseCondBranch(LastInst, TBB, Cond);
365 return false;
366 }
367 return true; // Can't handle indirect branch.
368 }
369
370 // Get the instruction before it if it is a terminator.
371 MachineInstr *SecondLastInst = &*I;
372 unsigned SecondLastOpc = SecondLastInst->getOpcode();
373
374 // If AllowModify is true and the block ends with two or more unconditional
375 // branches, delete all but the first unconditional branch.
376 if (AllowModify && isUncondBranchOpcode(LastOpc)) {
377 while (isUncondBranchOpcode(SecondLastOpc)) {
378 LastInst->eraseFromParent();
379 LastInst = SecondLastInst;
380 LastOpc = LastInst->getOpcode();
381 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
382 // Return now; the only remaining terminator is an unconditional branch.
383 TBB = LastInst->getOperand(0).getMBB();
384 return false;
385 }
386 SecondLastInst = &*I;
387 SecondLastOpc = SecondLastInst->getOpcode();
388 }
389 }
390
391 // If we're allowed to modify and the block ends in an unconditional branch
392 // which could simply fallthrough, remove the branch. (Note: This case only
393 // matters when we can't understand the whole sequence, otherwise it's also
394 // handled by BranchFolding.cpp.)
395 if (AllowModify && isUncondBranchOpcode(LastOpc) &&
396 MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
397 LastInst->eraseFromParent();
398 LastInst = SecondLastInst;
399 LastOpc = LastInst->getOpcode();
400 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
401 assert(!isUncondBranchOpcode(LastOpc) &&
402 "unreachable unconditional branches removed above");
403
404 if (isCondBranchOpcode(LastOpc)) {
405 // Block ends with fall-through condbranch.
406 parseCondBranch(LastInst, TBB, Cond);
407 return false;
408 }
409 return true; // Can't handle indirect branch.
410 }
411 SecondLastInst = &*I;
412 SecondLastOpc = SecondLastInst->getOpcode();
413 }
414
415 // If there are three terminators, we don't know what sort of block this is.
416 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
417 return true;
418
419 // If the block ends with a B and a Bcc, handle it.
420 if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
421 parseCondBranch(SecondLastInst, TBB, Cond);
422 FBB = LastInst->getOperand(0).getMBB();
423 return false;
424 }
425
426 // If the block ends with two unconditional branches, handle it. The second
427 // one is not executed, so remove it.
428 if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
429 TBB = SecondLastInst->getOperand(0).getMBB();
430 I = LastInst;
431 if (AllowModify)
432 I->eraseFromParent();
433 return false;
434 }
435
436 // ...likewise if it ends with an indirect branch followed by an unconditional
437 // branch.
438 if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
439 I = LastInst;
440 if (AllowModify)
441 I->eraseFromParent();
442 return true;
443 }
444
445 // Otherwise, can't handle this.
446 return true;
447 }
448
449 bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
450 MachineBranchPredicate &MBP,
451 bool AllowModify) const {
452 // For the moment, handle only a block which ends with a cb(n)zx followed by
453 // a fallthrough. Why this? Because it is a common form.
454 // TODO: Should we handle b.cc?
455
456 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
457 if (I == MBB.end())
458 return true;
459
460 // Skip over SpeculationBarrierEndBB terminators
461 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
462 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
463 --I;
464 }
465
466 if (!isUnpredicatedTerminator(*I))
467 return true;
468
469 // Get the last instruction in the block.
470 MachineInstr *LastInst = &*I;
471 unsigned LastOpc = LastInst->getOpcode();
472 if (!isCondBranchOpcode(LastOpc))
473 return true;
474
475 switch (LastOpc) {
476 default:
477 return true;
478 case AArch64::CBZW:
479 case AArch64::CBZX:
480 case AArch64::CBNZW:
481 case AArch64::CBNZX:
482 break;
483 };
484
485 MBP.TrueDest = LastInst->getOperand(1).getMBB();
486 assert(MBP.TrueDest && "expected!");
487 MBP.FalseDest = MBB.getNextNode();
488
489 MBP.ConditionDef = nullptr;
490 MBP.SingleUseCondition = false;
491
492 MBP.LHS = LastInst->getOperand(0);
493 MBP.RHS = MachineOperand::CreateImm(0);
494 MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
495 : MachineBranchPredicate::PRED_EQ;
496 return false;
497 }
498
499 bool AArch64InstrInfo::reverseBranchCondition(
500 SmallVectorImpl<MachineOperand> &Cond) const {
501 if (Cond[0].getImm() != -1) {
502 // Regular Bcc
503 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
504 Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
505 } else {
506 // Folded compare-and-branch
507 switch (Cond[1].getImm()) {
508 default:
509 llvm_unreachable("Unknown conditional branch!");
510 case AArch64::CBZW:
511 Cond[1].setImm(AArch64::CBNZW);
512 break;
513 case AArch64::CBNZW:
514 Cond[1].setImm(AArch64::CBZW);
515 break;
516 case AArch64::CBZX:
517 Cond[1].setImm(AArch64::CBNZX);
518 break;
519 case AArch64::CBNZX:
520 Cond[1].setImm(AArch64::CBZX);
521 break;
522 case AArch64::TBZW:
523 Cond[1].setImm(AArch64::TBNZW);
524 break;
525 case AArch64::TBNZW:
526 Cond[1].setImm(AArch64::TBZW);
527 break;
528 case AArch64::TBZX:
529 Cond[1].setImm(AArch64::TBNZX);
530 break;
531 case AArch64::TBNZX:
532 Cond[1].setImm(AArch64::TBZX);
533 break;
534 }
535 }
536
537 return false;
538 }
539
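// Remove the branch terminators at the end of MBB (at most one conditional
// and one unconditional branch), returning how many instructions were removed
// and, optionally, how many bytes they occupied.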
540 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
541 int *BytesRemoved) const {
542 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
543 if (I == MBB.end())
544 return 0;
545
546 if (!isUncondBranchOpcode(I->getOpcode()) &&
547 !isCondBranchOpcode(I->getOpcode()))
548 return 0;
549
550 // Remove the branch.
551 I->eraseFromParent();
552
553 I = MBB.end();
554
555 if (I == MBB.begin()) {
556 if (BytesRemoved)
557 *BytesRemoved = 4;
558 return 1;
559 }
560 --I;
561 if (!isCondBranchOpcode(I->getOpcode())) {
562 if (BytesRemoved)
563 *BytesRemoved = 4;
564 return 1;
565 }
566
567 // Remove the branch.
568 I->eraseFromParent();
569 if (BytesRemoved)
570 *BytesRemoved = 8;
571
572 return 2;
573 }
574
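// Materialize a conditional branch to TBB from condition operands in the
// encoding produced by parseCondBranch/analyzeBranch.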
575 void AArch64InstrInfo::instantiateCondBranch(
576 MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
577 ArrayRef<MachineOperand> Cond) const {
578 if (Cond[0].getImm() != -1) {
579 // Regular Bcc
580 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
581 } else {
582 // Folded compare-and-branch
583 // Note that we use addOperand instead of addReg to keep the flags.
584 const MachineInstrBuilder MIB =
585 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
586 if (Cond.size() > 3)
587 MIB.addImm(Cond[3].getImm());
588 MIB.addMBB(TBB);
589 }
590 }
591
592 unsigned AArch64InstrInfo::insertBranch(
593 MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
594 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
595 // Shouldn't be a fall through.
596 assert(TBB && "insertBranch must not be told to insert a fallthrough");
597
598 if (!FBB) {
599 if (Cond.empty()) // Unconditional branch?
600 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
601 else
602 instantiateCondBranch(MBB, DL, TBB, Cond);
603
604 if (BytesAdded)
605 *BytesAdded = 4;
606
607 return 1;
608 }
609
610 // Two-way conditional branch.
611 instantiateCondBranch(MBB, DL, TBB, Cond);
612 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
613
614 if (BytesAdded)
615 *BytesAdded = 8;
616
617 return 2;
618 }
619
620 // Find the original register that VReg is copied from.
621 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
622 while (Register::isVirtualRegister(VReg)) {
623 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
624 if (!DefMI->isFullCopy())
625 return VReg;
626 VReg = DefMI->getOperand(1).getReg();
627 }
628 return VReg;
629 }
630
631 // Determine if VReg is defined by an instruction that can be folded into a
632 // csel instruction. If so, return the folded opcode, and the replacement
633 // register.
634 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
635 unsigned *NewVReg = nullptr) {
636 VReg = removeCopies(MRI, VReg);
637 if (!Register::isVirtualRegister(VReg))
638 return 0;
639
640 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
641 const MachineInstr *DefMI = MRI.getVRegDef(VReg);
642 unsigned Opc = 0;
643 unsigned SrcOpNum = 0;
644 switch (DefMI->getOpcode()) {
645 case AArch64::ADDSXri:
646 case AArch64::ADDSWri:
647 // if NZCV is used, do not fold.
648 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
649 true) == -1)
650 return 0;
651 // fall-through to ADDXri and ADDWri.
652 [[fallthrough]];
653 case AArch64::ADDXri:
654 case AArch64::ADDWri:
655 // add x, 1 -> csinc.
656 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
657 DefMI->getOperand(3).getImm() != 0)
658 return 0;
659 SrcOpNum = 1;
660 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
661 break;
662
663 case AArch64::ORNXrr:
664 case AArch64::ORNWrr: {
665 // not x -> csinv, represented as orn dst, xzr, src.
666 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
667 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
668 return 0;
669 SrcOpNum = 2;
670 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
671 break;
672 }
673
674 case AArch64::SUBSXrr:
675 case AArch64::SUBSWrr:
676 // if NZCV is used, do not fold.
677 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
678 true) == -1)
679 return 0;
680 // fall-through to SUBXrr and SUBWrr.
681 [[fallthrough]];
682 case AArch64::SUBXrr:
683 case AArch64::SUBWrr: {
684 // neg x -> csneg, represented as sub dst, xzr, src.
685 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
686 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
687 return 0;
688 SrcOpNum = 2;
689 Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
690 break;
691 }
692 default:
693 return 0;
694 }
695 assert(Opc && SrcOpNum && "Missing parameters");
696
697 if (NewVReg)
698 *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
699 return Opc;
700 }
701
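// Report whether the select described by Cond, TrueReg and FalseReg can be
// lowered to CSEL/FCSEL, and estimate the latency of the condition and of the
// two operands.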
702 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
703 ArrayRef<MachineOperand> Cond,
704 Register DstReg, Register TrueReg,
705 Register FalseReg, int &CondCycles,
706 int &TrueCycles,
707 int &FalseCycles) const {
708 // Check register classes.
709 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
710 const TargetRegisterClass *RC =
711 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
712 if (!RC)
713 return false;
714
715 // Also need to check the dest regclass, in case we're trying to optimize
716 // something like:
717 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
718 if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
719 return false;
720
721 // Expanding cbz/tbz requires an extra cycle of latency on the condition.
722 unsigned ExtraCondLat = Cond.size() != 1;
723
724 // GPRs are handled by csel.
725 // FIXME: Fold in x+1, -x, and ~x when applicable.
726 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
727 AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
728 // Single-cycle csel, csinc, csinv, and csneg.
729 CondCycles = 1 + ExtraCondLat;
730 TrueCycles = FalseCycles = 1;
731 if (canFoldIntoCSel(MRI, TrueReg))
732 TrueCycles = 0;
733 else if (canFoldIntoCSel(MRI, FalseReg))
734 FalseCycles = 0;
735 return true;
736 }
737
738 // Scalar floating point is handled by fcsel.
739 // FIXME: Form fabs, fmin, and fmax when applicable.
740 if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
741 AArch64::FPR32RegClass.hasSubClassEq(RC)) {
742 CondCycles = 5 + ExtraCondLat;
743 TrueCycles = FalseCycles = 2;
744 return true;
745 }
746
747 // Can't do vectors.
748 return false;
749 }
750
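// Emit a compare (when the condition came from cbz/tbz) followed by a csel or
// fcsel, folding simple operations on the operands into csinc/csinv/csneg
// where possible.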
751 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
752 MachineBasicBlock::iterator I,
753 const DebugLoc &DL, Register DstReg,
754 ArrayRef<MachineOperand> Cond,
755 Register TrueReg, Register FalseReg) const {
756 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
757
758 // Parse the condition code, see parseCondBranch() above.
759 AArch64CC::CondCode CC;
760 switch (Cond.size()) {
761 default:
762 llvm_unreachable("Unknown condition opcode in Cond");
763 case 1: // b.cc
764 CC = AArch64CC::CondCode(Cond[0].getImm());
765 break;
766 case 3: { // cbz/cbnz
767 // We must insert a compare against 0.
768 bool Is64Bit;
769 switch (Cond[1].getImm()) {
770 default:
771 llvm_unreachable("Unknown branch opcode in Cond");
772 case AArch64::CBZW:
773 Is64Bit = false;
774 CC = AArch64CC::EQ;
775 break;
776 case AArch64::CBZX:
777 Is64Bit = true;
778 CC = AArch64CC::EQ;
779 break;
780 case AArch64::CBNZW:
781 Is64Bit = false;
782 CC = AArch64CC::NE;
783 break;
784 case AArch64::CBNZX:
785 Is64Bit = true;
786 CC = AArch64CC::NE;
787 break;
788 }
789 Register SrcReg = Cond[2].getReg();
790 if (Is64Bit) {
791 // cmp reg, #0 is actually subs xzr, reg, #0.
792 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
793 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
794 .addReg(SrcReg)
795 .addImm(0)
796 .addImm(0);
797 } else {
798 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
799 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
800 .addReg(SrcReg)
801 .addImm(0)
802 .addImm(0);
803 }
804 break;
805 }
806 case 4: { // tbz/tbnz
807 // We must insert a tst instruction.
808 switch (Cond[1].getImm()) {
809 default:
810 llvm_unreachable("Unknown branch opcode in Cond");
811 case AArch64::TBZW:
812 case AArch64::TBZX:
813 CC = AArch64CC::EQ;
814 break;
815 case AArch64::TBNZW:
816 case AArch64::TBNZX:
817 CC = AArch64CC::NE;
818 break;
819 }
820 // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
821 if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
822 BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
823 .addReg(Cond[2].getReg())
824 .addImm(
825 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
826 else
827 BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
828 .addReg(Cond[2].getReg())
829 .addImm(
830 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
831 break;
832 }
833 }
834
835 unsigned Opc = 0;
836 const TargetRegisterClass *RC = nullptr;
837 bool TryFold = false;
838 if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
839 RC = &AArch64::GPR64RegClass;
840 Opc = AArch64::CSELXr;
841 TryFold = true;
842 } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
843 RC = &AArch64::GPR32RegClass;
844 Opc = AArch64::CSELWr;
845 TryFold = true;
846 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
847 RC = &AArch64::FPR64RegClass;
848 Opc = AArch64::FCSELDrrr;
849 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
850 RC = &AArch64::FPR32RegClass;
851 Opc = AArch64::FCSELSrrr;
852 }
853 assert(RC && "Unsupported regclass");
854
855 // Try folding simple instructions into the csel.
856 if (TryFold) {
857 unsigned NewVReg = 0;
858 unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
859 if (FoldedOpc) {
860 // The folded opcodes csinc, csinv and csneg apply the operation to
861 // FalseReg, so we need to invert the condition.
862 CC = AArch64CC::getInvertedCondCode(CC);
863 TrueReg = FalseReg;
864 } else
865 FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
866
867 // Fold the operation. Leave any dead instructions for DCE to clean up.
868 if (FoldedOpc) {
869 FalseReg = NewVReg;
870 Opc = FoldedOpc;
871 // This extends the live range of NewVReg.
872 MRI.clearKillFlags(NewVReg);
873 }
874 }
875
876 // Pull all virtual registers into the appropriate class.
877 MRI.constrainRegClass(TrueReg, RC);
878 MRI.constrainRegClass(FalseReg, RC);
879
880 // Insert the csel.
881 BuildMI(MBB, I, DL, get(Opc), DstReg)
882 .addReg(TrueReg)
883 .addReg(FalseReg)
884 .addImm(CC);
885 }
886
887 // Return true if Imm can be loaded into a register by a "cheap" sequence of
888 // instructions. For now, "cheap" means at most two instructions.
889 static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
890 if (BitSize == 32)
891 return true;
892
893 assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
894 uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
895 SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
896 AArch64_IMM::expandMOVImm(Imm, BitSize, Is);
897
898 return Is.size() <= 2;
899 }
900
901 // FIXME: this implementation should be micro-architecture dependent, so a
902 // micro-architecture target hook should be introduced here in future.
903 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
904 if (Subtarget.hasExynosCheapAsMoveHandling()) {
905 if (isExynosCheapAsMove(MI))
906 return true;
907 return MI.isAsCheapAsAMove();
908 }
909
910 switch (MI.getOpcode()) {
911 default:
912 return MI.isAsCheapAsAMove();
913
914 case AArch64::ADDWrs:
915 case AArch64::ADDXrs:
916 case AArch64::SUBWrs:
917 case AArch64::SUBXrs:
918 return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;
919
920 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
921 // ORRXri, it is as cheap as MOV.
922 // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
923 case AArch64::MOVi32imm:
924 return isCheapImmediate(MI, 32);
925 case AArch64::MOVi64imm:
926 return isCheapImmediate(MI, 64);
927 }
928 }
929
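// Return true if MI's shifted- or extended-register form executes fast on
// Falkor.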
930 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
931 switch (MI.getOpcode()) {
932 default:
933 return false;
934
935 case AArch64::ADDWrs:
936 case AArch64::ADDXrs:
937 case AArch64::ADDSWrs:
938 case AArch64::ADDSXrs: {
939 unsigned Imm = MI.getOperand(3).getImm();
940 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
941 if (ShiftVal == 0)
942 return true;
943 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
944 }
945
946 case AArch64::ADDWrx:
947 case AArch64::ADDXrx:
948 case AArch64::ADDXrx64:
949 case AArch64::ADDSWrx:
950 case AArch64::ADDSXrx:
951 case AArch64::ADDSXrx64: {
952 unsigned Imm = MI.getOperand(3).getImm();
953 switch (AArch64_AM::getArithExtendType(Imm)) {
954 default:
955 return false;
956 case AArch64_AM::UXTB:
957 case AArch64_AM::UXTH:
958 case AArch64_AM::UXTW:
959 case AArch64_AM::UXTX:
960 return AArch64_AM::getArithShiftValue(Imm) <= 4;
961 }
962 }
963
964 case AArch64::SUBWrs:
965 case AArch64::SUBSWrs: {
966 unsigned Imm = MI.getOperand(3).getImm();
967 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
968 return ShiftVal == 0 ||
969 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
970 }
971
972 case AArch64::SUBXrs:
973 case AArch64::SUBSXrs: {
974 unsigned Imm = MI.getOperand(3).getImm();
975 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
976 return ShiftVal == 0 ||
977 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
978 }
979
980 case AArch64::SUBWrx:
981 case AArch64::SUBXrx:
982 case AArch64::SUBXrx64:
983 case AArch64::SUBSWrx:
984 case AArch64::SUBSXrx:
985 case AArch64::SUBSXrx64: {
986 unsigned Imm = MI.getOperand(3).getImm();
987 switch (AArch64_AM::getArithExtendType(Imm)) {
988 default:
989 return false;
990 case AArch64_AM::UXTB:
991 case AArch64_AM::UXTH:
992 case AArch64_AM::UXTW:
993 case AArch64_AM::UXTX:
994 return AArch64_AM::getArithShiftValue(Imm) == 0;
995 }
996 }
997
998 case AArch64::LDRBBroW:
999 case AArch64::LDRBBroX:
1000 case AArch64::LDRBroW:
1001 case AArch64::LDRBroX:
1002 case AArch64::LDRDroW:
1003 case AArch64::LDRDroX:
1004 case AArch64::LDRHHroW:
1005 case AArch64::LDRHHroX:
1006 case AArch64::LDRHroW:
1007 case AArch64::LDRHroX:
1008 case AArch64::LDRQroW:
1009 case AArch64::LDRQroX:
1010 case AArch64::LDRSBWroW:
1011 case AArch64::LDRSBWroX:
1012 case AArch64::LDRSBXroW:
1013 case AArch64::LDRSBXroX:
1014 case AArch64::LDRSHWroW:
1015 case AArch64::LDRSHWroX:
1016 case AArch64::LDRSHXroW:
1017 case AArch64::LDRSHXroX:
1018 case AArch64::LDRSWroW:
1019 case AArch64::LDRSWroX:
1020 case AArch64::LDRSroW:
1021 case AArch64::LDRSroX:
1022 case AArch64::LDRWroW:
1023 case AArch64::LDRWroX:
1024 case AArch64::LDRXroW:
1025 case AArch64::LDRXroX:
1026 case AArch64::PRFMroW:
1027 case AArch64::PRFMroX:
1028 case AArch64::STRBBroW:
1029 case AArch64::STRBBroX:
1030 case AArch64::STRBroW:
1031 case AArch64::STRBroX:
1032 case AArch64::STRDroW:
1033 case AArch64::STRDroX:
1034 case AArch64::STRHHroW:
1035 case AArch64::STRHHroX:
1036 case AArch64::STRHroW:
1037 case AArch64::STRHroX:
1038 case AArch64::STRQroW:
1039 case AArch64::STRQroX:
1040 case AArch64::STRSroW:
1041 case AArch64::STRSroX:
1042 case AArch64::STRWroW:
1043 case AArch64::STRWroX:
1044 case AArch64::STRXroW:
1045 case AArch64::STRXroX: {
1046 unsigned IsSigned = MI.getOperand(3).getImm();
1047 return !IsSigned;
1048 }
1049 }
1050 }
1051
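/// Return true if MI is one of the SEH pseudo-instructions used to describe
/// Windows unwind information.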
1052 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
1053 unsigned Opc = MI.getOpcode();
1054 switch (Opc) {
1055 default:
1056 return false;
1057 case AArch64::SEH_StackAlloc:
1058 case AArch64::SEH_SaveFPLR:
1059 case AArch64::SEH_SaveFPLR_X:
1060 case AArch64::SEH_SaveReg:
1061 case AArch64::SEH_SaveReg_X:
1062 case AArch64::SEH_SaveRegP:
1063 case AArch64::SEH_SaveRegP_X:
1064 case AArch64::SEH_SaveFReg:
1065 case AArch64::SEH_SaveFReg_X:
1066 case AArch64::SEH_SaveFRegP:
1067 case AArch64::SEH_SaveFRegP_X:
1068 case AArch64::SEH_SetFP:
1069 case AArch64::SEH_AddFP:
1070 case AArch64::SEH_Nop:
1071 case AArch64::SEH_PrologEnd:
1072 case AArch64::SEH_EpilogStart:
1073 case AArch64::SEH_EpilogEnd:
1074 case AArch64::SEH_PACSignLR:
1075 case AArch64::SEH_SaveAnyRegQP:
1076 case AArch64::SEH_SaveAnyRegQPX:
1077 return true;
1078 }
1079 }
1080
1081 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
1082 Register &SrcReg, Register &DstReg,
1083 unsigned &SubIdx) const {
1084 switch (MI.getOpcode()) {
1085 default:
1086 return false;
1087 case AArch64::SBFMXri: // aka sxtw
1088 case AArch64::UBFMXri: // aka uxtw
1089 // Check for the 32 -> 64 bit extension case, these instructions can do
1090 // much more.
1091 if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
1092 return false;
1093 // This is a signed or unsigned 32 -> 64 bit extension.
1094 SrcReg = MI.getOperand(1).getReg();
1095 DstReg = MI.getOperand(0).getReg();
1096 SubIdx = AArch64::sub_32;
1097 return true;
1098 }
1099 }
1100
1101 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
1102 const MachineInstr &MIa, const MachineInstr &MIb) const {
1103 const TargetRegisterInfo *TRI = &getRegisterInfo();
1104 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1105 int64_t OffsetA = 0, OffsetB = 0;
1106 TypeSize WidthA(0, false), WidthB(0, false);
1107 bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1108
1109 assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1110 assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1111
1112 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
1113 MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
1114 return false;
1115
1116 // Retrieve the base, offset from the base and width. Width
1117 // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
1118 // the bases are identical, and the offset of the lower memory access +
1119 // the width doesn't overlap the offset of the higher memory access,
1120 // then the memory accesses are different.
1121 // If OffsetAIsScalable and OffsetBIsScalable are both true, they
1122 // are assumed to have the same scale (vscale).
1123 if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
1124 WidthA, TRI) &&
1125 getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
1126 WidthB, TRI)) {
1127 if (BaseOpA->isIdenticalTo(*BaseOpB) &&
1128 OffsetAIsScalable == OffsetBIsScalable) {
1129 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1130 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1131 TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1132 if (LowWidth.isScalable() == OffsetAIsScalable &&
1133 LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
1134 return true;
1135 }
1136 }
1137 return false;
1138 }
1139
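// In addition to the generic boundaries, treat branch-target (BTI) landing
// instructions, CSDB/DSB/ISB barriers, SMSTART/SMSTOP, SEH pseudos and
// instructions that immediately precede a CFI directive as scheduling
// boundaries.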
1140 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
1141 const MachineBasicBlock *MBB,
1142 const MachineFunction &MF) const {
1143 if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
1144 return true;
1145
1146 // Do not move an instruction that can be recognized as a branch target.
1147 if (hasBTISemantics(MI))
1148 return true;
1149
1150 switch (MI.getOpcode()) {
1151 case AArch64::HINT:
1152 // CSDB hints are scheduling barriers.
1153 if (MI.getOperand(0).getImm() == 0x14)
1154 return true;
1155 break;
1156 case AArch64::DSB:
1157 case AArch64::ISB:
1158 // DSB and ISB also are scheduling barriers.
1159 return true;
1160 case AArch64::MSRpstatesvcrImm1:
1161 // SMSTART and SMSTOP are also scheduling barriers.
1162 return true;
1163 default:;
1164 }
1165 if (isSEHInstruction(MI))
1166 return true;
1167 auto Next = std::next(MI.getIterator());
1168 return Next != MBB->end() && Next->isCFIInstruction();
1169 }
1170
1171 /// analyzeCompare - For a comparison instruction, return the source registers
1172 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1173 /// Return true if the comparison instruction can be analyzed.
1174 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
1175 Register &SrcReg2, int64_t &CmpMask,
1176 int64_t &CmpValue) const {
1177 // The first operand can be a frame index where we'd normally expect a
1178 // register.
1179 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1180 if (!MI.getOperand(1).isReg())
1181 return false;
1182
1183 switch (MI.getOpcode()) {
1184 default:
1185 break;
1186 case AArch64::PTEST_PP:
1187 case AArch64::PTEST_PP_ANY:
1188 SrcReg = MI.getOperand(0).getReg();
1189 SrcReg2 = MI.getOperand(1).getReg();
1190 // Not sure about the mask and value for now...
1191 CmpMask = ~0;
1192 CmpValue = 0;
1193 return true;
1194 case AArch64::SUBSWrr:
1195 case AArch64::SUBSWrs:
1196 case AArch64::SUBSWrx:
1197 case AArch64::SUBSXrr:
1198 case AArch64::SUBSXrs:
1199 case AArch64::SUBSXrx:
1200 case AArch64::ADDSWrr:
1201 case AArch64::ADDSWrs:
1202 case AArch64::ADDSWrx:
1203 case AArch64::ADDSXrr:
1204 case AArch64::ADDSXrs:
1205 case AArch64::ADDSXrx:
1206 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1207 SrcReg = MI.getOperand(1).getReg();
1208 SrcReg2 = MI.getOperand(2).getReg();
1209 CmpMask = ~0;
1210 CmpValue = 0;
1211 return true;
1212 case AArch64::SUBSWri:
1213 case AArch64::ADDSWri:
1214 case AArch64::SUBSXri:
1215 case AArch64::ADDSXri:
1216 SrcReg = MI.getOperand(1).getReg();
1217 SrcReg2 = 0;
1218 CmpMask = ~0;
1219 CmpValue = MI.getOperand(2).getImm();
1220 return true;
1221 case AArch64::ANDSWri:
1222 case AArch64::ANDSXri:
1223 // ANDS does not use the same encoding scheme as the other xxxS
1224 // instructions.
1225 SrcReg = MI.getOperand(1).getReg();
1226 SrcReg2 = 0;
1227 CmpMask = ~0;
1228 CmpValue = AArch64_AM::decodeLogicalImmediate(
1229 MI.getOperand(2).getImm(),
1230 MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
1231 return true;
1232 }
1233
1234 return false;
1235 }
1236
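// Re-constrain the register operands of Instr to the classes required by its
// (possibly updated) opcode; returns false if an operand cannot be
// constrained.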
1237 static bool UpdateOperandRegClass(MachineInstr &Instr) {
1238 MachineBasicBlock *MBB = Instr.getParent();
1239 assert(MBB && "Can't get MachineBasicBlock here");
1240 MachineFunction *MF = MBB->getParent();
1241 assert(MF && "Can't get MachineFunction here");
1242 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1243 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1244 MachineRegisterInfo *MRI = &MF->getRegInfo();
1245
1246 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1247 ++OpIdx) {
1248 MachineOperand &MO = Instr.getOperand(OpIdx);
1249 const TargetRegisterClass *OpRegCstraints =
1250 Instr.getRegClassConstraint(OpIdx, TII, TRI);
1251
1252 // If there's no constraint, there's nothing to do.
1253 if (!OpRegCstraints)
1254 continue;
1255 // If the operand is a frame index, there's nothing to do here.
1256 // A frame index operand will resolve correctly during PEI.
1257 if (MO.isFI())
1258 continue;
1259
1260 assert(MO.isReg() &&
1261 "Operand has register constraints without being a register!");
1262
1263 Register Reg = MO.getReg();
1264 if (Reg.isPhysical()) {
1265 if (!OpRegCstraints->contains(Reg))
1266 return false;
1267 } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
1268 !MRI->constrainRegClass(Reg, OpRegCstraints))
1269 return false;
1270 }
1271
1272 return true;
1273 }
1274
1275 /// Return the opcode that does not set flags when possible - otherwise
1276 /// return the original opcode. The caller is responsible to do the actual
1277 /// substitution and legality checking.
1278 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
1279 // Don't convert all compare instructions, because for some the zero register
1280 // encoding becomes the sp register.
1281 bool MIDefinesZeroReg = false;
1282 if (MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1283 MI.definesRegister(AArch64::XZR, /*TRI=*/nullptr))
1284 MIDefinesZeroReg = true;
1285
1286 switch (MI.getOpcode()) {
1287 default:
1288 return MI.getOpcode();
1289 case AArch64::ADDSWrr:
1290 return AArch64::ADDWrr;
1291 case AArch64::ADDSWri:
1292 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1293 case AArch64::ADDSWrs:
1294 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1295 case AArch64::ADDSWrx:
1296 return AArch64::ADDWrx;
1297 case AArch64::ADDSXrr:
1298 return AArch64::ADDXrr;
1299 case AArch64::ADDSXri:
1300 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1301 case AArch64::ADDSXrs:
1302 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1303 case AArch64::ADDSXrx:
1304 return AArch64::ADDXrx;
1305 case AArch64::SUBSWrr:
1306 return AArch64::SUBWrr;
1307 case AArch64::SUBSWri:
1308 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1309 case AArch64::SUBSWrs:
1310 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1311 case AArch64::SUBSWrx:
1312 return AArch64::SUBWrx;
1313 case AArch64::SUBSXrr:
1314 return AArch64::SUBXrr;
1315 case AArch64::SUBSXri:
1316 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1317 case AArch64::SUBSXrs:
1318 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1319 case AArch64::SUBSXrx:
1320 return AArch64::SUBXrx;
1321 }
1322 }
1323
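// Kinds of NZCV access checked for by areCFlagsAccessedBetweenInstrs().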
1324 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1325
1326 /// True when condition flags are accessed (either by writing or reading)
1327 /// on the instruction trace starting at From and ending at To.
1328 ///
1329 /// Note: If From and To are in different blocks it's assumed the condition
1330 /// flags are accessed on the path.
1331 static bool areCFlagsAccessedBetweenInstrs(
1332 MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1333 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1334 // Early exit if To is at the beginning of the BB.
1335 if (To == To->getParent()->begin())
1336 return true;
1337
1338 // Check whether the instructions are in the same basic block
1339 // If not, assume the condition flags might get modified somewhere.
1340 if (To->getParent() != From->getParent())
1341 return true;
1342
1343 // From must be above To.
1344 assert(std::any_of(
1345 ++To.getReverse(), To->getParent()->rend(),
1346 [From](MachineInstr &MI) { return MI.getIterator() == From; }));
1347
1348 // We iterate backward starting at \p To until we hit \p From.
1349 for (const MachineInstr &Instr :
1350 instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
1351 if (((AccessToCheck & AK_Write) &&
1352 Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1353 ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1354 return true;
1355 }
1356 return false;
1357 }
1358
1359 std::optional<unsigned>
1360 AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
1361 MachineInstr *Pred,
1362 const MachineRegisterInfo *MRI) const {
1363 unsigned MaskOpcode = Mask->getOpcode();
1364 unsigned PredOpcode = Pred->getOpcode();
1365 bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
1366 bool PredIsWhileLike = isWhileOpcode(PredOpcode);
1367
1368 if (PredIsWhileLike) {
1369 // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc
1370 // instruction and the condition is "any" since WHILEcc does an implicit
1371 // PTEST(ALL, PG) check and PG is always a subset of ALL.
1372 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1373 return PredOpcode;
1374
1375 // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
1376 // redundant since WHILE performs an implicit PTEST with an all active
1377 // mask.
1378 if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1379 getElementSizeForOpcode(MaskOpcode) ==
1380 getElementSizeForOpcode(PredOpcode))
1381 return PredOpcode;
1382
1383 return {};
1384 }
1385
1386 if (PredIsPTestLike) {
1387 // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1388 // instruction that sets the flags as PTEST would and the condition is
1389 // "any" since PG is always a subset of the governing predicate of the
1390 // ptest-like instruction.
1391 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1392 return PredOpcode;
1393
1394 // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
1395 // element size matches and either the PTEST_LIKE instruction uses
1396 // the same all active mask or the condition is "any".
1397 if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 &&
1398 getElementSizeForOpcode(MaskOpcode) ==
1399 getElementSizeForOpcode(PredOpcode)) {
1400 auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1401 if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY)
1402 return PredOpcode;
1403 }
1404
1405 // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
1406 // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
1407 // on 8-bit predicates like the PTEST. Otherwise, for instructions like
1408 // compare that also support 16/32/64-bit predicates, the implicit PTEST
1409 // performed by the compare could consider fewer lanes for these element
1410 // sizes.
1411 //
1412 // For example, consider
1413 //
1414 // ptrue p0.b ; P0=1111-1111-1111-1111
1415 // index z0.s, #0, #1 ; Z0=<0,1,2,3>
1416 // index z1.s, #1, #1 ; Z1=<1,2,3,4>
1417 // cmphi p1.s, p0/z, z1.s, z0.s ; P1=0001-0001-0001-0001
1418 // ; ^ last active
1419 // ptest p0, p1.b ; P1=0001-0001-0001-0001
1420 // ; ^ last active
1421 //
1422 // where the compare generates a canonical all active 32-bit predicate
1423 // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
1424 // active flag, whereas the PTEST instruction with the same mask doesn't.
1425 // For PTEST_ANY this doesn't apply as the flags in this case would be
1426 // identical regardless of element size.
1427 auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1428 uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
1429 if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB ||
1430 PTest->getOpcode() == AArch64::PTEST_PP_ANY))
1431 return PredOpcode;
1432
1433 return {};
1434 }
1435
1436 // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
1437 // opcode so the PTEST becomes redundant.
1438 switch (PredOpcode) {
1439 case AArch64::AND_PPzPP:
1440 case AArch64::BIC_PPzPP:
1441 case AArch64::EOR_PPzPP:
1442 case AArch64::NAND_PPzPP:
1443 case AArch64::NOR_PPzPP:
1444 case AArch64::ORN_PPzPP:
1445 case AArch64::ORR_PPzPP:
1446 case AArch64::BRKA_PPzP:
1447 case AArch64::BRKPA_PPzPP:
1448 case AArch64::BRKB_PPzP:
1449 case AArch64::BRKPB_PPzPP:
1450 case AArch64::RDFFR_PPz: {
1451 // Check to see if our mask is the same. If not the resulting flag bits
1452 // may be different and we can't remove the ptest.
1453 auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
1454 if (Mask != PredMask)
1455 return {};
1456 break;
1457 }
1458 case AArch64::BRKN_PPzP: {
1459 // BRKN uses an all active implicit mask to set flags unlike the other
1460 // flag-setting instructions.
1461 // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
1462 if ((MaskOpcode != AArch64::PTRUE_B) ||
1463 (Mask->getOperand(1).getImm() != 31))
1464 return {};
1465 break;
1466 }
1467 case AArch64::PTRUE_B:
1468 // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
1469 break;
1470 default:
1471 // Bail out if we don't recognize the input
1472 return {};
1473 }
1474
1475 return convertToFlagSettingOpc(PredOpcode);
1476 }
1477
1478 /// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1479 /// operation which could set the flags in an identical manner
1480 bool AArch64InstrInfo::optimizePTestInstr(
1481 MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1482 const MachineRegisterInfo *MRI) const {
1483 auto *Mask = MRI->getUniqueVRegDef(MaskReg);
1484 auto *Pred = MRI->getUniqueVRegDef(PredReg);
1485 unsigned PredOpcode = Pred->getOpcode();
1486 auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI);
1487 if (!NewOp)
1488 return false;
1489
1490 const TargetRegisterInfo *TRI = &getRegisterInfo();
1491
1492 // If another instruction between Pred and PTest accesses flags, don't remove
1493 // the ptest or update the earlier instruction to modify them.
1494 if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
1495 return false;
1496
1497 // If we pass all the checks, it's safe to remove the PTEST and use the flags
1498 // as they are prior to PTEST. Sometimes this requires the tested PTEST
1499 // operand to be replaced with an equivalent instruction that also sets the
1500 // flags.
1501 PTest->eraseFromParent();
1502 if (*NewOp != PredOpcode) {
1503 Pred->setDesc(get(*NewOp));
1504 bool succeeded = UpdateOperandRegClass(*Pred);
1505 (void)succeeded;
1506 assert(succeeded && "Operands have incompatible register classes!");
1507 Pred->addRegisterDefined(AArch64::NZCV, TRI);
1508 }
1509
1510 // Ensure that the flags def is live.
1511 if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1512 unsigned i = 0, e = Pred->getNumOperands();
1513 for (; i != e; ++i) {
1514 MachineOperand &MO = Pred->getOperand(i);
1515 if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1516 MO.setIsDead(false);
1517 break;
1518 }
1519 }
1520 }
1521 return true;
1522 }
1523
1524 /// Try to optimize a compare instruction. A compare instruction is an
1525 /// instruction which produces AArch64::NZCV. It is truly a compare
1526 /// instruction only when there are no uses of its destination
1527 /// register.
1528 ///
1529 /// The following steps are tried in order:
1530 /// 1. Convert CmpInstr into an unconditional version.
1531 /// 2. Remove CmpInstr if above there is an instruction producing a needed
1532 /// condition code or an instruction which can be converted into such an
1533 /// instruction.
1534 /// Only comparison with zero is supported.
1535 bool AArch64InstrInfo::optimizeCompareInstr(
1536 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
1537 int64_t CmpValue, const MachineRegisterInfo *MRI) const {
1538 assert(CmpInstr.getParent());
1539 assert(MRI);
1540
1541 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1542 int DeadNZCVIdx =
1543 CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
1544 if (DeadNZCVIdx != -1) {
1545 if (CmpInstr.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1546 CmpInstr.definesRegister(AArch64::XZR, /*TRI=*/nullptr)) {
1547 CmpInstr.eraseFromParent();
1548 return true;
1549 }
1550 unsigned Opc = CmpInstr.getOpcode();
1551 unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
1552 if (NewOpc == Opc)
1553 return false;
1554 const MCInstrDesc &MCID = get(NewOpc);
1555 CmpInstr.setDesc(MCID);
1556 CmpInstr.removeOperand(DeadNZCVIdx);
1557 bool succeeded = UpdateOperandRegClass(CmpInstr);
1558 (void)succeeded;
1559 assert(succeeded && "Some operands reg class are incompatible!");
1560 return true;
1561 }
1562
1563 if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
1564 CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY)
1565 return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);
1566
1567 if (SrcReg2 != 0)
1568 return false;
1569
1570 // CmpInstr is a Compare instruction if destination register is not used.
1571 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
1572 return false;
1573
1574 if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
1575 return true;
1576 return (CmpValue == 0 || CmpValue == 1) &&
1577 removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
1578 }
1579
1580 /// Get opcode of S version of Instr.
1581 /// If Instr is S version its opcode is returned.
1582 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
1583 /// or we are not interested in it.
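///
/// For example (sketch), sForm of AArch64::ADDWrr is AArch64::ADDSWrr:
/// \code
///   add  w0, w1, w2   // ADDWrr, does not set flags
///   adds w0, w1, w2   // ADDSWrr, also sets NZCV
/// \endcode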
1584 static unsigned sForm(MachineInstr &Instr) {
1585 switch (Instr.getOpcode()) {
1586 default:
1587 return AArch64::INSTRUCTION_LIST_END;
1588
1589 case AArch64::ADDSWrr:
1590 case AArch64::ADDSWri:
1591 case AArch64::ADDSXrr:
1592 case AArch64::ADDSXri:
1593 case AArch64::SUBSWrr:
1594 case AArch64::SUBSWri:
1595 case AArch64::SUBSXrr:
1596 case AArch64::SUBSXri:
1597 return Instr.getOpcode();
1598
1599 case AArch64::ADDWrr:
1600 return AArch64::ADDSWrr;
1601 case AArch64::ADDWri:
1602 return AArch64::ADDSWri;
1603 case AArch64::ADDXrr:
1604 return AArch64::ADDSXrr;
1605 case AArch64::ADDXri:
1606 return AArch64::ADDSXri;
1607 case AArch64::ADCWr:
1608 return AArch64::ADCSWr;
1609 case AArch64::ADCXr:
1610 return AArch64::ADCSXr;
1611 case AArch64::SUBWrr:
1612 return AArch64::SUBSWrr;
1613 case AArch64::SUBWri:
1614 return AArch64::SUBSWri;
1615 case AArch64::SUBXrr:
1616 return AArch64::SUBSXrr;
1617 case AArch64::SUBXri:
1618 return AArch64::SUBSXri;
1619 case AArch64::SBCWr:
1620 return AArch64::SBCSWr;
1621 case AArch64::SBCXr:
1622 return AArch64::SBCSXr;
1623 case AArch64::ANDWri:
1624 return AArch64::ANDSWri;
1625 case AArch64::ANDXri:
1626 return AArch64::ANDSXri;
1627 }
1628 }
1629
1630 /// Check if AArch64::NZCV should be alive in successors of MBB.
1631 static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
1632 for (auto *BB : MBB->successors())
1633 if (BB->isLiveIn(AArch64::NZCV))
1634 return true;
1635 return false;
1636 }
1637
1638 /// \returns The condition code operand index for \p Instr if it is a branch
1639 /// or select and -1 otherwise.
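/// For AArch64::Bcc the condition code is the operand two slots before the
/// implicit NZCV use; for the CSEL/CSINC/CSINV/CSNEG/FCSEL forms it is the
/// operand immediately before the NZCV use.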
1640 static int
1641 findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
1642 switch (Instr.getOpcode()) {
1643 default:
1644 return -1;
1645
1646 case AArch64::Bcc: {
1647 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
1648 assert(Idx >= 2);
1649 return Idx - 2;
1650 }
1651
1652 case AArch64::CSINVWr:
1653 case AArch64::CSINVXr:
1654 case AArch64::CSINCWr:
1655 case AArch64::CSINCXr:
1656 case AArch64::CSELWr:
1657 case AArch64::CSELXr:
1658 case AArch64::CSNEGWr:
1659 case AArch64::CSNEGXr:
1660 case AArch64::FCSELSrrr:
1661 case AArch64::FCSELDrrr: {
1662 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
1663 assert(Idx >= 1);
1664 return Idx - 1;
1665 }
1666 }
1667 }
1668
1669 /// Find a condition code used by the instruction.
1670 /// Returns AArch64CC::Invalid if either the instruction does not use condition
1671 /// codes or we don't optimize CmpInstr in the presence of such instructions.
1672 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1673 int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
1674 return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
1675 Instr.getOperand(CCIdx).getImm())
1676 : AArch64CC::Invalid;
1677 }
1678
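/// Map a condition code to the NZCV flags it reads: e.g. EQ/NE read only Z,
/// HS/LO read only C, and GT/LE read Z, N and V.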
1679 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1680 assert(CC != AArch64CC::Invalid);
1681 UsedNZCV UsedFlags;
1682 switch (CC) {
1683 default:
1684 break;
1685
1686 case AArch64CC::EQ: // Z set
1687 case AArch64CC::NE: // Z clear
1688 UsedFlags.Z = true;
1689 break;
1690
1691 case AArch64CC::HI: // Z clear and C set
1692 case AArch64CC::LS: // Z set or C clear
1693 UsedFlags.Z = true;
1694 [[fallthrough]];
1695 case AArch64CC::HS: // C set
1696 case AArch64CC::LO: // C clear
1697 UsedFlags.C = true;
1698 break;
1699
1700 case AArch64CC::MI: // N set
1701 case AArch64CC::PL: // N clear
1702 UsedFlags.N = true;
1703 break;
1704
1705 case AArch64CC::VS: // V set
1706 case AArch64CC::VC: // V clear
1707 UsedFlags.V = true;
1708 break;
1709
1710 case AArch64CC::GT: // Z clear, N and V the same
1711 case AArch64CC::LE: // Z set, N and V differ
1712 UsedFlags.Z = true;
1713 [[fallthrough]];
1714 case AArch64CC::GE: // N and V the same
1715 case AArch64CC::LT: // N and V differ
1716 UsedFlags.N = true;
1717 UsedFlags.V = true;
1718 break;
1719 }
1720 return UsedFlags;
1721 }
1722
1723 /// \returns The condition flags used after \p CmpInstr in its MachineBB if
1724 /// NZCV flags are not alive in the successors of the block containing both
1725 /// \p CmpInstr and \p MI, and std::nullopt otherwise.
1726 ///
1727 /// Collects the instructions using those flags in \p CCUseInstrs if provided.
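///
/// For example (illustrative), if CmpInstr is followed in its block only by a
/// "b.eq" and a "b.hs" branch, the result is the union {Z, C}, provided NZCV
/// is not live into any successor block.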
1728 std::optional<UsedNZCV>
1729 llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
1730 const TargetRegisterInfo &TRI,
1731 SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
1732 MachineBasicBlock *CmpParent = CmpInstr.getParent();
1733 if (MI.getParent() != CmpParent)
1734 return std::nullopt;
1735
1736 if (areCFlagsAliveInSuccessors(CmpParent))
1737 return std::nullopt;
1738
1739 UsedNZCV NZCVUsedAfterCmp;
1740 for (MachineInstr &Instr : instructionsWithoutDebug(
1741 std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
1742 if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
1743 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1744 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1745 return std::nullopt;
1746 NZCVUsedAfterCmp |= getUsedNZCV(CC);
1747 if (CCUseInstrs)
1748 CCUseInstrs->push_back(&Instr);
1749 }
1750 if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
1751 break;
1752 }
1753 return NZCVUsedAfterCmp;
1754 }
1755
1756 static bool isADDSRegImm(unsigned Opcode) {
1757 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1758 }
1759
1760 static bool isSUBSRegImm(unsigned Opcode) {
1761 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1762 }
1763
1764 /// Check if CmpInstr can be substituted by MI.
1765 ///
1766 /// CmpInstr can be substituted:
1767 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1768 /// - and, MI and CmpInstr are from the same MachineBB
1769 /// - and, condition flags are not alive in successors of the CmpInstr parent
1770 /// - and, if MI opcode is the S form there must be no defs of flags between
1771 /// MI and CmpInstr
1772 /// or if MI opcode is not the S form there must be neither defs of flags
1773 /// nor uses of flags between MI and CmpInstr.
1774 /// - and, the C flag is not used after CmpInstr, and either the V flag is
1775 /// not used after CmpInstr or MI produces a poison value if signed
1776 /// overflow occurs.
1777 static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
1778 const TargetRegisterInfo &TRI) {
1779 // NOTE: this assertion guarantees that MI.getOpcode() is an add or a
1780 // subtraction that may or may not set flags.
1781 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
1782
1783 const unsigned CmpOpcode = CmpInstr.getOpcode();
1784 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
1785 return false;
1786
1787 assert((CmpInstr.getOperand(2).isImm() &&
1788 CmpInstr.getOperand(2).getImm() == 0) &&
1789 "Caller guarantees that CmpInstr compares with constant 0");
1790
1791 std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
1792 if (!NZVCUsed || NZVCUsed->C)
1793 return false;
1794
1795 // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
1796 // '%vreg = add ...' or '%vreg = sub ...'.
1797 // Condition flag V is used to indicate signed overflow.
1798 // 1) MI and CmpInstr set N and V to the same value.
1799 // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
1800 // signed overflow occurs, so CmpInstr could still be simplified away.
1801 if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap))
1802 return false;
1803
1804 AccessKind AccessToCheck = AK_Write;
1805 if (sForm(MI) != MI.getOpcode())
1806 AccessToCheck = AK_All;
1807 return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
1808 }
1809
1810 /// Substitute an instruction comparing to zero with another instruction
1811 /// which produces needed condition flags.
1812 ///
1813 /// Return true on success.
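///
/// For example (illustrative):
/// \code
///   sub w8, w9, w10
///   cmp w8, #0
///   b.ne ...
/// \endcode
/// becomes
/// \code
///   subs w8, w9, w10
///   b.ne ...
/// \endcode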
1814 bool AArch64InstrInfo::substituteCmpToZero(
1815 MachineInstr &CmpInstr, unsigned SrcReg,
1816 const MachineRegisterInfo &MRI) const {
1817 // Get the unique definition of SrcReg.
1818 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1819 if (!MI)
1820 return false;
1821
1822 const TargetRegisterInfo &TRI = getRegisterInfo();
1823
1824 unsigned NewOpc = sForm(*MI);
1825 if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1826 return false;
1827
1828 if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
1829 return false;
1830
1831 // Update the instruction to set NZCV.
1832 MI->setDesc(get(NewOpc));
1833 CmpInstr.eraseFromParent();
1834 bool succeeded = UpdateOperandRegClass(*MI);
1835 (void)succeeded;
1836 assert(succeeded && "Some operands reg class are incompatible!");
1837 MI->addRegisterDefined(AArch64::NZCV, &TRI);
1838 return true;
1839 }
1840
1841 /// \returns True if \p CmpInstr can be removed.
1842 ///
1843 /// \p IsInvertCC is true if, after removing \p CmpInstr, condition
1844 /// codes used in \p CCUseInstrs must be inverted.
1845 static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
1846 int CmpValue, const TargetRegisterInfo &TRI,
1847 SmallVectorImpl<MachineInstr *> &CCUseInstrs,
1848 bool &IsInvertCC) {
1849 assert((CmpValue == 0 || CmpValue == 1) &&
1850 "Only comparisons to 0 or 1 considered for removal!");
1851
1852 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
1853 unsigned MIOpc = MI.getOpcode();
1854 if (MIOpc == AArch64::CSINCWr) {
1855 if (MI.getOperand(1).getReg() != AArch64::WZR ||
1856 MI.getOperand(2).getReg() != AArch64::WZR)
1857 return false;
1858 } else if (MIOpc == AArch64::CSINCXr) {
1859 if (MI.getOperand(1).getReg() != AArch64::XZR ||
1860 MI.getOperand(2).getReg() != AArch64::XZR)
1861 return false;
1862 } else {
1863 return false;
1864 }
1865 AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI);
1866 if (MICC == AArch64CC::Invalid)
1867 return false;
1868
1869 // NZCV needs to be defined
1870 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) != -1)
1871 return false;
1872
1873 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
1874 const unsigned CmpOpcode = CmpInstr.getOpcode();
1875 bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
1876 if (CmpValue && !IsSubsRegImm)
1877 return false;
1878 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
1879 return false;
1880
1881 // MI conditions allowed: eq, ne, mi, pl
1882 UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
1883 if (MIUsedNZCV.C || MIUsedNZCV.V)
1884 return false;
1885
1886 std::optional<UsedNZCV> NZCVUsedAfterCmp =
1887 examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
1888 // Condition flags must not be used in the successors of CmpInstr's basic
1889 // block, and only the Z or N flags may be used after CmpInstr within it.
1890 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
1891 return false;
1892 // Z or N flag used after CmpInstr must correspond to the flag used in MI
1893 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
1894 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
1895 return false;
1896 // If CmpInstr is comparison to zero MI conditions are limited to eq, ne
1897 if (MIUsedNZCV.N && !CmpValue)
1898 return false;
1899
1900 // There must be no defs of flags between MI and CmpInstr
1901 if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
1902 return false;
1903
1904 // Condition code is inverted in the following cases:
1905 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1906 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
1907 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
1908 (!CmpValue && MICC == AArch64CC::NE);
1909 return true;
1910 }
1911
1912 /// Remove comparison in csinc-cmp sequence
1913 ///
1914 /// Examples:
1915 /// 1. \code
1916 /// csinc w9, wzr, wzr, ne
1917 /// cmp w9, #0
1918 /// b.eq
1919 /// \endcode
1920 /// to
1921 /// \code
1922 /// csinc w9, wzr, wzr, ne
1923 /// b.ne
1924 /// \endcode
1925 ///
1926 /// 2. \code
1927 /// csinc x2, xzr, xzr, mi
1928 /// cmp x2, #1
1929 /// b.pl
1930 /// \endcode
1931 /// to
1932 /// \code
1933 /// csinc x2, xzr, xzr, mi
1934 /// b.pl
1935 /// \endcode
1936 ///
1937 /// \param CmpInstr comparison instruction
1938 /// \return True when comparison removed
1939 bool AArch64InstrInfo::removeCmpToZeroOrOne(
1940 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
1941 const MachineRegisterInfo &MRI) const {
1942 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
1943 if (!MI)
1944 return false;
1945 const TargetRegisterInfo &TRI = getRegisterInfo();
1946 SmallVector<MachineInstr *, 4> CCUseInstrs;
1947 bool IsInvertCC = false;
1948 if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
1949 IsInvertCC))
1950 return false;
1951 // Make transformation
1952 CmpInstr.eraseFromParent();
1953 if (IsInvertCC) {
1954 // Invert condition codes in CmpInstr CC users
1955 for (MachineInstr *CCUseInstr : CCUseInstrs) {
1956 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr);
1957 assert(Idx >= 0 && "Unexpected instruction using CC.");
1958 MachineOperand &CCOperand = CCUseInstr->getOperand(Idx);
1959 AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
1960 static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
1961 CCOperand.setImm(CCUse);
1962 }
1963 }
1964 return true;
1965 }
1966
1967 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1968 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1969 MI.getOpcode() != AArch64::CATCHRET)
1970 return false;
1971
1972 MachineBasicBlock &MBB = *MI.getParent();
1973 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
1974 auto TRI = Subtarget.getRegisterInfo();
1975 DebugLoc DL = MI.getDebugLoc();
1976
1977 if (MI.getOpcode() == AArch64::CATCHRET) {
1978 // Skip to the first instruction before the epilog.
1979 const TargetInstrInfo *TII =
1980 MBB.getParent()->getSubtarget().getInstrInfo();
1981 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
1982 auto MBBI = MachineBasicBlock::iterator(MI);
1983 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
1984 while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
1985 FirstEpilogSEH != MBB.begin())
1986 FirstEpilogSEH = std::prev(FirstEpilogSEH);
1987 if (FirstEpilogSEH != MBB.begin())
1988 FirstEpilogSEH = std::next(FirstEpilogSEH);
1989 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
1990 .addReg(AArch64::X0, RegState::Define)
1991 .addMBB(TargetMBB);
1992 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
1993 .addReg(AArch64::X0, RegState::Define)
1994 .addReg(AArch64::X0)
1995 .addMBB(TargetMBB)
1996 .addImm(0);
1997 return true;
1998 }
1999
2000 Register Reg = MI.getOperand(0).getReg();
2001 Module &M = *MBB.getParent()->getFunction().getParent();
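  // With a "sysreg" stack protector guard, the guard is loaded relative to a
  // system register, e.g. (illustrative, guard register sp_el0 with offset 40):
  //   mrs x8, SP_EL0
  //   ldr x8, [x8, #40]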
2002 if (M.getStackProtectorGuard() == "sysreg") {
2003 const AArch64SysReg::SysReg *SrcReg =
2004 AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
2005 if (!SrcReg)
2006 report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
2007
2008 // mrs xN, sysreg
2009 BuildMI(MBB, MI, DL, get(AArch64::MRS))
2010 .addDef(Reg, RegState::Renamable)
2011 .addImm(SrcReg->Encoding);
2012 int Offset = M.getStackProtectorGuardOffset();
2013 if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
2014 // ldr xN, [xN, #offset]
2015 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2016 .addDef(Reg)
2017 .addUse(Reg, RegState::Kill)
2018 .addImm(Offset / 8);
2019 } else if (Offset >= -256 && Offset <= 255) {
2020 // ldur xN, [xN, #offset]
2021 BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
2022 .addDef(Reg)
2023 .addUse(Reg, RegState::Kill)
2024 .addImm(Offset);
2025 } else if (Offset >= -4095 && Offset <= 4095) {
2026 if (Offset > 0) {
2027 // add xN, xN, #offset
2028 BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
2029 .addDef(Reg)
2030 .addUse(Reg, RegState::Kill)
2031 .addImm(Offset)
2032 .addImm(0);
2033 } else {
2034 // sub xN, xN, #offset
2035 BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
2036 .addDef(Reg)
2037 .addUse(Reg, RegState::Kill)
2038 .addImm(-Offset)
2039 .addImm(0);
2040 }
2041 // ldr xN, [xN]
2042 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2043 .addDef(Reg)
2044 .addUse(Reg, RegState::Kill)
2045 .addImm(0);
2046 } else {
2047 // Cases where the offset is larger than +/- 4095 and not a multiple of 8,
2048 // or larger than 32760.
2049 // It might be nice to use AArch64::MOVi32imm here, which would get
2050 // expanded in PreSched2 after PostRA, but our lone scratch Reg already
2051 // contains the MRS result. findScratchNonCalleeSaveRegister() in
2052 // AArch64FrameLowering might help us find such a scratch register
2053 // though. If we failed to find a scratch register, we could emit a
2054 // stream of add instructions to build up the immediate. Or, we could try
2055 // to insert a AArch64::MOVi32imm before register allocation so that we
2056 // didn't need to scavenge for a scratch register.
2057 report_fatal_error("Unable to encode Stack Protector Guard Offset");
2058 }
2059 MBB.erase(MI);
2060 return true;
2061 }
2062
2063 const GlobalValue *GV =
2064 cast<GlobalValue>((*MI.memoperands_begin())->getValue());
2065 const TargetMachine &TM = MBB.getParent()->getTarget();
2066 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2067 const unsigned char MO_NC = AArch64II::MO_NC;
2068
2069 if ((OpFlags & AArch64II::MO_GOT) != 0) {
2070 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
2071 .addGlobalAddress(GV, 0, OpFlags);
2072 if (Subtarget.isTargetILP32()) {
2073 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2074 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2075 .addDef(Reg32, RegState::Dead)
2076 .addUse(Reg, RegState::Kill)
2077 .addImm(0)
2078 .addMemOperand(*MI.memoperands_begin())
2079 .addDef(Reg, RegState::Implicit);
2080 } else {
2081 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2082 .addReg(Reg, RegState::Kill)
2083 .addImm(0)
2084 .addMemOperand(*MI.memoperands_begin());
2085 }
2086 } else if (TM.getCodeModel() == CodeModel::Large) {
2087 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
2088 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
2089 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
2090 .addImm(0);
2091 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2092 .addReg(Reg, RegState::Kill)
2093 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
2094 .addImm(16);
2095 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2096 .addReg(Reg, RegState::Kill)
2097 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
2098 .addImm(32);
2099 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2100 .addReg(Reg, RegState::Kill)
2101 .addGlobalAddress(GV, 0, AArch64II::MO_G3)
2102 .addImm(48);
2103 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2104 .addReg(Reg, RegState::Kill)
2105 .addImm(0)
2106 .addMemOperand(*MI.memoperands_begin());
2107 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2108 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
2109 .addGlobalAddress(GV, 0, OpFlags);
2110 } else {
2111 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
2112 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2113 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2114 if (Subtarget.isTargetILP32()) {
2115 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2116 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2117 .addDef(Reg32, RegState::Dead)
2118 .addUse(Reg, RegState::Kill)
2119 .addGlobalAddress(GV, 0, LoFlags)
2120 .addMemOperand(*MI.memoperands_begin())
2121 .addDef(Reg, RegState::Implicit);
2122 } else {
2123 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2124 .addReg(Reg, RegState::Kill)
2125 .addGlobalAddress(GV, 0, LoFlags)
2126 .addMemOperand(*MI.memoperands_begin());
2127 }
2128 }
2129
2130 MBB.erase(MI);
2131
2132 return true;
2133 }
2134
2135 // Return true if this instruction simply sets its single destination register
2136 // to zero. This is equivalent to a register rename of the zero-register.
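// For example, "mov w0, #0" (MOVZWi with a zero immediate) and a COPY from
// WZR are both recognized here.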
2137 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
2138 switch (MI.getOpcode()) {
2139 default:
2140 break;
2141 case AArch64::MOVZWi:
2142 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2143 if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2144 assert(MI.getDesc().getNumOperands() == 3 &&
2145 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2146 return true;
2147 }
2148 break;
2149 case AArch64::ANDWri: // and Rd, Rzr, #imm
2150 return MI.getOperand(1).getReg() == AArch64::WZR;
2151 case AArch64::ANDXri:
2152 return MI.getOperand(1).getReg() == AArch64::XZR;
2153 case TargetOpcode::COPY:
2154 return MI.getOperand(1).getReg() == AArch64::WZR;
2155 }
2156 return false;
2157 }
2158
2159 // Return true if this instruction simply renames a general register without
2160 // modifying bits.
2161 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
2162 switch (MI.getOpcode()) {
2163 default:
2164 break;
2165 case TargetOpcode::COPY: {
2166 // GPR32 copies will be lowered to ORRXrs
2167 Register DstReg = MI.getOperand(0).getReg();
2168 return (AArch64::GPR32RegClass.contains(DstReg) ||
2169 AArch64::GPR64RegClass.contains(DstReg));
2170 }
2171 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2172 if (MI.getOperand(1).getReg() == AArch64::XZR) {
2173 assert(MI.getDesc().getNumOperands() == 4 &&
2174 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2175 return true;
2176 }
2177 break;
2178 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2179 if (MI.getOperand(2).getImm() == 0) {
2180 assert(MI.getDesc().getNumOperands() == 4 &&
2181 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2182 return true;
2183 }
2184 break;
2185 }
2186 return false;
2187 }
2188
2189 // Return true if this instruction simply renames a floating-point/vector
2190 // register without modifying bits.
2191 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
2192 switch (MI.getOpcode()) {
2193 default:
2194 break;
2195 case TargetOpcode::COPY: {
2196 Register DstReg = MI.getOperand(0).getReg();
2197 return AArch64::FPR128RegClass.contains(DstReg);
2198 }
2199 case AArch64::ORRv16i8:
2200 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2201 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2202 "invalid ORRv16i8 operands");
2203 return true;
2204 }
2205 break;
2206 }
2207 return false;
2208 }
2209
2210 Register AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
2211 int &FrameIndex) const {
2212 switch (MI.getOpcode()) {
2213 default:
2214 break;
2215 case AArch64::LDRWui:
2216 case AArch64::LDRXui:
2217 case AArch64::LDRBui:
2218 case AArch64::LDRHui:
2219 case AArch64::LDRSui:
2220 case AArch64::LDRDui:
2221 case AArch64::LDRQui:
2222 case AArch64::LDR_PXI:
2223 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2224 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2225 FrameIndex = MI.getOperand(1).getIndex();
2226 return MI.getOperand(0).getReg();
2227 }
2228 break;
2229 }
2230
2231 return 0;
2232 }
2233
2234 Register AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
2235 int &FrameIndex) const {
2236 switch (MI.getOpcode()) {
2237 default:
2238 break;
2239 case AArch64::STRWui:
2240 case AArch64::STRXui:
2241 case AArch64::STRBui:
2242 case AArch64::STRHui:
2243 case AArch64::STRSui:
2244 case AArch64::STRDui:
2245 case AArch64::STRQui:
2246 case AArch64::STR_PXI:
2247 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2248 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2249 FrameIndex = MI.getOperand(1).getIndex();
2250 return MI.getOperand(0).getReg();
2251 }
2252 break;
2253 }
2254 return 0;
2255 }
2256
2257 /// Check all MachineMemOperands for a hint to suppress pairing.
2258 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
2259 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2260 return MMO->getFlags() & MOSuppressPair;
2261 });
2262 }
2263
2264 /// Set a flag on the first MachineMemOperand to suppress pairing.
2265 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
2266 if (MI.memoperands_empty())
2267 return;
2268 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2269 }
2270
2271 /// Check all MachineMemOperands for a hint that the load/store is strided.
2272 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
2273 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2274 return MMO->getFlags() & MOStridedAccess;
2275 });
2276 }
2277
2278 bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) {
2279 switch (Opc) {
2280 default:
2281 return false;
2282 case AArch64::STURSi:
2283 case AArch64::STRSpre:
2284 case AArch64::STURDi:
2285 case AArch64::STRDpre:
2286 case AArch64::STURQi:
2287 case AArch64::STRQpre:
2288 case AArch64::STURBBi:
2289 case AArch64::STURHHi:
2290 case AArch64::STURWi:
2291 case AArch64::STRWpre:
2292 case AArch64::STURXi:
2293 case AArch64::STRXpre:
2294 case AArch64::LDURSi:
2295 case AArch64::LDRSpre:
2296 case AArch64::LDURDi:
2297 case AArch64::LDRDpre:
2298 case AArch64::LDURQi:
2299 case AArch64::LDRQpre:
2300 case AArch64::LDURWi:
2301 case AArch64::LDRWpre:
2302 case AArch64::LDURXi:
2303 case AArch64::LDRXpre:
2304 case AArch64::LDRSWpre:
2305 case AArch64::LDURSWi:
2306 case AArch64::LDURHHi:
2307 case AArch64::LDURBBi:
2308 case AArch64::LDURSBWi:
2309 case AArch64::LDURSHWi:
2310 return true;
2311 }
2312 }
2313
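// Map a scaled [Reg, #Imm] load/store opcode to its unscaled LDUR*/STUR*
// counterpart (e.g. LDRXui -> LDURXi), or std::nullopt if there is none.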
2314 std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2315 switch (Opc) {
2316 default: return {};
2317 case AArch64::PRFMui: return AArch64::PRFUMi;
2318 case AArch64::LDRXui: return AArch64::LDURXi;
2319 case AArch64::LDRWui: return AArch64::LDURWi;
2320 case AArch64::LDRBui: return AArch64::LDURBi;
2321 case AArch64::LDRHui: return AArch64::LDURHi;
2322 case AArch64::LDRSui: return AArch64::LDURSi;
2323 case AArch64::LDRDui: return AArch64::LDURDi;
2324 case AArch64::LDRQui: return AArch64::LDURQi;
2325 case AArch64::LDRBBui: return AArch64::LDURBBi;
2326 case AArch64::LDRHHui: return AArch64::LDURHHi;
2327 case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2328 case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2329 case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2330 case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2331 case AArch64::LDRSWui: return AArch64::LDURSWi;
2332 case AArch64::STRXui: return AArch64::STURXi;
2333 case AArch64::STRWui: return AArch64::STURWi;
2334 case AArch64::STRBui: return AArch64::STURBi;
2335 case AArch64::STRHui: return AArch64::STURHi;
2336 case AArch64::STRSui: return AArch64::STURSi;
2337 case AArch64::STRDui: return AArch64::STURDi;
2338 case AArch64::STRQui: return AArch64::STURQi;
2339 case AArch64::STRBBui: return AArch64::STURBBi;
2340 case AArch64::STRHHui: return AArch64::STURHHi;
2341 }
2342 }
2343
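// Return the operand index of the immediate offset: 3 for paired and most SVE
// forms, which carry an extra register or predicate operand, and 2 otherwise.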
2344 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
2345 switch (Opc) {
2346 default:
2347 return 2;
2348 case AArch64::LDPXi:
2349 case AArch64::LDPDi:
2350 case AArch64::STPXi:
2351 case AArch64::STPDi:
2352 case AArch64::LDNPXi:
2353 case AArch64::LDNPDi:
2354 case AArch64::STNPXi:
2355 case AArch64::STNPDi:
2356 case AArch64::LDPQi:
2357 case AArch64::STPQi:
2358 case AArch64::LDNPQi:
2359 case AArch64::STNPQi:
2360 case AArch64::LDPWi:
2361 case AArch64::LDPSi:
2362 case AArch64::STPWi:
2363 case AArch64::STPSi:
2364 case AArch64::LDNPWi:
2365 case AArch64::LDNPSi:
2366 case AArch64::STNPWi:
2367 case AArch64::STNPSi:
2368 case AArch64::LDG:
2369 case AArch64::STGPi:
2370
2371 case AArch64::LD1B_IMM:
2372 case AArch64::LD1B_H_IMM:
2373 case AArch64::LD1B_S_IMM:
2374 case AArch64::LD1B_D_IMM:
2375 case AArch64::LD1SB_H_IMM:
2376 case AArch64::LD1SB_S_IMM:
2377 case AArch64::LD1SB_D_IMM:
2378 case AArch64::LD1H_IMM:
2379 case AArch64::LD1H_S_IMM:
2380 case AArch64::LD1H_D_IMM:
2381 case AArch64::LD1SH_S_IMM:
2382 case AArch64::LD1SH_D_IMM:
2383 case AArch64::LD1W_IMM:
2384 case AArch64::LD1W_D_IMM:
2385 case AArch64::LD1SW_D_IMM:
2386 case AArch64::LD1D_IMM:
2387
2388 case AArch64::LD2B_IMM:
2389 case AArch64::LD2H_IMM:
2390 case AArch64::LD2W_IMM:
2391 case AArch64::LD2D_IMM:
2392 case AArch64::LD3B_IMM:
2393 case AArch64::LD3H_IMM:
2394 case AArch64::LD3W_IMM:
2395 case AArch64::LD3D_IMM:
2396 case AArch64::LD4B_IMM:
2397 case AArch64::LD4H_IMM:
2398 case AArch64::LD4W_IMM:
2399 case AArch64::LD4D_IMM:
2400
2401 case AArch64::ST1B_IMM:
2402 case AArch64::ST1B_H_IMM:
2403 case AArch64::ST1B_S_IMM:
2404 case AArch64::ST1B_D_IMM:
2405 case AArch64::ST1H_IMM:
2406 case AArch64::ST1H_S_IMM:
2407 case AArch64::ST1H_D_IMM:
2408 case AArch64::ST1W_IMM:
2409 case AArch64::ST1W_D_IMM:
2410 case AArch64::ST1D_IMM:
2411
2412 case AArch64::ST2B_IMM:
2413 case AArch64::ST2H_IMM:
2414 case AArch64::ST2W_IMM:
2415 case AArch64::ST2D_IMM:
2416 case AArch64::ST3B_IMM:
2417 case AArch64::ST3H_IMM:
2418 case AArch64::ST3W_IMM:
2419 case AArch64::ST3D_IMM:
2420 case AArch64::ST4B_IMM:
2421 case AArch64::ST4H_IMM:
2422 case AArch64::ST4W_IMM:
2423 case AArch64::ST4D_IMM:
2424
2425 case AArch64::LD1RB_IMM:
2426 case AArch64::LD1RB_H_IMM:
2427 case AArch64::LD1RB_S_IMM:
2428 case AArch64::LD1RB_D_IMM:
2429 case AArch64::LD1RSB_H_IMM:
2430 case AArch64::LD1RSB_S_IMM:
2431 case AArch64::LD1RSB_D_IMM:
2432 case AArch64::LD1RH_IMM:
2433 case AArch64::LD1RH_S_IMM:
2434 case AArch64::LD1RH_D_IMM:
2435 case AArch64::LD1RSH_S_IMM:
2436 case AArch64::LD1RSH_D_IMM:
2437 case AArch64::LD1RW_IMM:
2438 case AArch64::LD1RW_D_IMM:
2439 case AArch64::LD1RSW_IMM:
2440 case AArch64::LD1RD_IMM:
2441
2442 case AArch64::LDNT1B_ZRI:
2443 case AArch64::LDNT1H_ZRI:
2444 case AArch64::LDNT1W_ZRI:
2445 case AArch64::LDNT1D_ZRI:
2446 case AArch64::STNT1B_ZRI:
2447 case AArch64::STNT1H_ZRI:
2448 case AArch64::STNT1W_ZRI:
2449 case AArch64::STNT1D_ZRI:
2450
2451 case AArch64::LDNF1B_IMM:
2452 case AArch64::LDNF1B_H_IMM:
2453 case AArch64::LDNF1B_S_IMM:
2454 case AArch64::LDNF1B_D_IMM:
2455 case AArch64::LDNF1SB_H_IMM:
2456 case AArch64::LDNF1SB_S_IMM:
2457 case AArch64::LDNF1SB_D_IMM:
2458 case AArch64::LDNF1H_IMM:
2459 case AArch64::LDNF1H_S_IMM:
2460 case AArch64::LDNF1H_D_IMM:
2461 case AArch64::LDNF1SH_S_IMM:
2462 case AArch64::LDNF1SH_D_IMM:
2463 case AArch64::LDNF1W_IMM:
2464 case AArch64::LDNF1W_D_IMM:
2465 case AArch64::LDNF1SW_D_IMM:
2466 case AArch64::LDNF1D_IMM:
2467 return 3;
2468 case AArch64::ADDG:
2469 case AArch64::STGi:
2470 case AArch64::LDR_PXI:
2471 case AArch64::STR_PXI:
2472 return 2;
2473 }
2474 }
2475
2476 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
2477 switch (MI.getOpcode()) {
2478 default:
2479 return false;
2480 // Scaled instructions.
2481 case AArch64::STRSui:
2482 case AArch64::STRDui:
2483 case AArch64::STRQui:
2484 case AArch64::STRXui:
2485 case AArch64::STRWui:
2486 case AArch64::LDRSui:
2487 case AArch64::LDRDui:
2488 case AArch64::LDRQui:
2489 case AArch64::LDRXui:
2490 case AArch64::LDRWui:
2491 case AArch64::LDRSWui:
2492 // Unscaled instructions.
2493 case AArch64::STURSi:
2494 case AArch64::STRSpre:
2495 case AArch64::STURDi:
2496 case AArch64::STRDpre:
2497 case AArch64::STURQi:
2498 case AArch64::STRQpre:
2499 case AArch64::STURWi:
2500 case AArch64::STRWpre:
2501 case AArch64::STURXi:
2502 case AArch64::STRXpre:
2503 case AArch64::LDURSi:
2504 case AArch64::LDRSpre:
2505 case AArch64::LDURDi:
2506 case AArch64::LDRDpre:
2507 case AArch64::LDURQi:
2508 case AArch64::LDRQpre:
2509 case AArch64::LDURWi:
2510 case AArch64::LDRWpre:
2511 case AArch64::LDURXi:
2512 case AArch64::LDRXpre:
2513 case AArch64::LDURSWi:
2514 case AArch64::LDRSWpre:
2515 return true;
2516 }
2517 }
2518
2519 bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) {
2520 switch (MI.getOpcode()) {
2521 default:
2522 assert((!MI.isCall() || !MI.isReturn()) &&
2523 "Unexpected instruction - was a new tail call opcode introduced?");
2524 return false;
2525 case AArch64::TCRETURNdi:
2526 case AArch64::TCRETURNri:
2527 case AArch64::TCRETURNrix16x17:
2528 case AArch64::TCRETURNrix17:
2529 case AArch64::TCRETURNrinotx16:
2530 case AArch64::TCRETURNriALL:
2531 case AArch64::AUTH_TCRETURN:
2532 case AArch64::AUTH_TCRETURN_BTI:
2533 return true;
2534 }
2535 }
2536
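// Return the flag-setting form of the given opcode, e.g. ADDWrr -> ADDSWrr
// ("add" -> "adds"), or the flag-setting SVE predicate form, e.g. AND_PPzPP ->
// ANDS_PPzPP.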
2537 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) {
2538 switch (Opc) {
2539 default:
2540 llvm_unreachable("Opcode has no flag setting equivalent!");
2541 // 32-bit cases:
2542 case AArch64::ADDWri:
2543 return AArch64::ADDSWri;
2544 case AArch64::ADDWrr:
2545 return AArch64::ADDSWrr;
2546 case AArch64::ADDWrs:
2547 return AArch64::ADDSWrs;
2548 case AArch64::ADDWrx:
2549 return AArch64::ADDSWrx;
2550 case AArch64::ANDWri:
2551 return AArch64::ANDSWri;
2552 case AArch64::ANDWrr:
2553 return AArch64::ANDSWrr;
2554 case AArch64::ANDWrs:
2555 return AArch64::ANDSWrs;
2556 case AArch64::BICWrr:
2557 return AArch64::BICSWrr;
2558 case AArch64::BICWrs:
2559 return AArch64::BICSWrs;
2560 case AArch64::SUBWri:
2561 return AArch64::SUBSWri;
2562 case AArch64::SUBWrr:
2563 return AArch64::SUBSWrr;
2564 case AArch64::SUBWrs:
2565 return AArch64::SUBSWrs;
2566 case AArch64::SUBWrx:
2567 return AArch64::SUBSWrx;
2568 // 64-bit cases:
2569 case AArch64::ADDXri:
2570 return AArch64::ADDSXri;
2571 case AArch64::ADDXrr:
2572 return AArch64::ADDSXrr;
2573 case AArch64::ADDXrs:
2574 return AArch64::ADDSXrs;
2575 case AArch64::ADDXrx:
2576 return AArch64::ADDSXrx;
2577 case AArch64::ANDXri:
2578 return AArch64::ANDSXri;
2579 case AArch64::ANDXrr:
2580 return AArch64::ANDSXrr;
2581 case AArch64::ANDXrs:
2582 return AArch64::ANDSXrs;
2583 case AArch64::BICXrr:
2584 return AArch64::BICSXrr;
2585 case AArch64::BICXrs:
2586 return AArch64::BICSXrs;
2587 case AArch64::SUBXri:
2588 return AArch64::SUBSXri;
2589 case AArch64::SUBXrr:
2590 return AArch64::SUBSXrr;
2591 case AArch64::SUBXrs:
2592 return AArch64::SUBSXrs;
2593 case AArch64::SUBXrx:
2594 return AArch64::SUBSXrx;
2595 // SVE instructions:
2596 case AArch64::AND_PPzPP:
2597 return AArch64::ANDS_PPzPP;
2598 case AArch64::BIC_PPzPP:
2599 return AArch64::BICS_PPzPP;
2600 case AArch64::EOR_PPzPP:
2601 return AArch64::EORS_PPzPP;
2602 case AArch64::NAND_PPzPP:
2603 return AArch64::NANDS_PPzPP;
2604 case AArch64::NOR_PPzPP:
2605 return AArch64::NORS_PPzPP;
2606 case AArch64::ORN_PPzPP:
2607 return AArch64::ORNS_PPzPP;
2608 case AArch64::ORR_PPzPP:
2609 return AArch64::ORRS_PPzPP;
2610 case AArch64::BRKA_PPzP:
2611 return AArch64::BRKAS_PPzP;
2612 case AArch64::BRKPA_PPzPP:
2613 return AArch64::BRKPAS_PPzPP;
2614 case AArch64::BRKB_PPzP:
2615 return AArch64::BRKBS_PPzP;
2616 case AArch64::BRKPB_PPzPP:
2617 return AArch64::BRKPBS_PPzPP;
2618 case AArch64::BRKN_PPzP:
2619 return AArch64::BRKNS_PPzP;
2620 case AArch64::RDFFR_PPz:
2621 return AArch64::RDFFRS_PPz;
2622 case AArch64::PTRUE_B:
2623 return AArch64::PTRUES_B;
2624 }
2625 }
2626
2627 // Is this a candidate for ld/st merging or pairing? For example, we don't
2628 // touch volatiles or load/stores that have a hint to avoid pair formation.
2629 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
2630
2631 bool IsPreLdSt = isPreLdSt(MI);
2632
2633 // If this is a volatile load/store, don't mess with it.
2634 if (MI.hasOrderedMemoryRef())
2635 return false;
2636
2637 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
2638 // For Pre-inc LD/ST, the operand is shifted by one.
2639 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
2640 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
2641 "Expected a reg or frame index operand.");
2642
2643 // For Pre-indexed addressing quadword instructions, the third operand is the
2644 // immediate value.
2645 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm();
2646
2647 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt)
2648 return false;
2649
2650 // Can't merge/pair if the instruction modifies the base register.
2651 // e.g., ldr x0, [x0]
2652 // This case will never occur with an FI base.
2653 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
2654 // STR<S,D,Q,W,X>pre, it can be merged.
2655 // For example:
2656 // ldr q0, [x11, #32]!
2657 // ldr q1, [x11, #16]
2658 // to
2659 // ldp q0, q1, [x11, #32]!
2660 if (MI.getOperand(1).isReg() && !IsPreLdSt) {
2661 Register BaseReg = MI.getOperand(1).getReg();
2662 const TargetRegisterInfo *TRI = &getRegisterInfo();
2663 if (MI.modifiesRegister(BaseReg, TRI))
2664 return false;
2665 }
2666
2667 // Check if this load/store has a hint to avoid pair formation.
2668 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
2669 if (isLdStPairSuppressed(MI))
2670 return false;
2671
2672 // Do not pair any callee-save store/reload instructions in the
2673 // prologue/epilogue if the CFI information encoded the operations as separate
2674 // instructions, as that will cause the size of the actual prologue to mismatch
2675 // with the prologue size recorded in the Windows CFI.
2676 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
2677 bool NeedsWinCFI = MAI->usesWindowsCFI() &&
2678 MI.getMF()->getFunction().needsUnwindTableEntry();
2679 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) ||
2680 MI.getFlag(MachineInstr::FrameDestroy)))
2681 return false;
2682
2683 // On some CPUs quad load/store pairs are slower than two single load/stores.
2684 if (Subtarget.isPaired128Slow()) {
2685 switch (MI.getOpcode()) {
2686 default:
2687 break;
2688 case AArch64::LDURQi:
2689 case AArch64::STURQi:
2690 case AArch64::LDRQui:
2691 case AArch64::STRQui:
2692 return false;
2693 }
2694 }
2695
2696 return true;
2697 }
2698
2699 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
2700 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
2701 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
2702 const TargetRegisterInfo *TRI) const {
2703 if (!LdSt.mayLoadOrStore())
2704 return false;
2705
2706 const MachineOperand *BaseOp;
2707 TypeSize WidthN(0, false);
2708 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
2709 WidthN, TRI))
2710 return false;
2711 // The maximum vscale is 16 under AArch64; return the maximal extent for the
2712 // vector.
2713 Width = LocationSize::precise(WidthN);
2714 BaseOps.push_back(BaseOp);
2715 return true;
2716 }
2717
2718 std::optional<ExtAddrMode>
2719 AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
2720 const TargetRegisterInfo *TRI) const {
2721 const MachineOperand *Base; // Filled with the base operand of MI.
2722 int64_t Offset; // Filled with the offset of MI.
2723 bool OffsetIsScalable;
2724 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
2725 return std::nullopt;
2726
2727 if (!Base->isReg())
2728 return std::nullopt;
2729 ExtAddrMode AM;
2730 AM.BaseReg = Base->getReg();
2731 AM.Displacement = Offset;
2732 AM.ScaledReg = 0;
2733 AM.Scale = 0;
2734 return AM;
2735 }
2736
2737 bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
2738 Register Reg,
2739 const MachineInstr &AddrI,
2740 ExtAddrMode &AM) const {
2741 // Filter out instructions into which we cannot fold.
2742 unsigned NumBytes;
2743 int64_t OffsetScale = 1;
2744 switch (MemI.getOpcode()) {
2745 default:
2746 return false;
2747
2748 case AArch64::LDURQi:
2749 case AArch64::STURQi:
2750 NumBytes = 16;
2751 break;
2752
2753 case AArch64::LDURDi:
2754 case AArch64::STURDi:
2755 case AArch64::LDURXi:
2756 case AArch64::STURXi:
2757 NumBytes = 8;
2758 break;
2759
2760 case AArch64::LDURWi:
2761 case AArch64::LDURSWi:
2762 case AArch64::STURWi:
2763 NumBytes = 4;
2764 break;
2765
2766 case AArch64::LDURHi:
2767 case AArch64::STURHi:
2768 case AArch64::LDURHHi:
2769 case AArch64::STURHHi:
2770 case AArch64::LDURSHXi:
2771 case AArch64::LDURSHWi:
2772 NumBytes = 2;
2773 break;
2774
2775 case AArch64::LDRBroX:
2776 case AArch64::LDRBBroX:
2777 case AArch64::LDRSBXroX:
2778 case AArch64::LDRSBWroX:
2779 case AArch64::STRBroX:
2780 case AArch64::STRBBroX:
2781 case AArch64::LDURBi:
2782 case AArch64::LDURBBi:
2783 case AArch64::LDURSBXi:
2784 case AArch64::LDURSBWi:
2785 case AArch64::STURBi:
2786 case AArch64::STURBBi:
2787 case AArch64::LDRBui:
2788 case AArch64::LDRBBui:
2789 case AArch64::LDRSBXui:
2790 case AArch64::LDRSBWui:
2791 case AArch64::STRBui:
2792 case AArch64::STRBBui:
2793 NumBytes = 1;
2794 break;
2795
2796 case AArch64::LDRQroX:
2797 case AArch64::STRQroX:
2798 case AArch64::LDRQui:
2799 case AArch64::STRQui:
2800 NumBytes = 16;
2801 OffsetScale = 16;
2802 break;
2803
2804 case AArch64::LDRDroX:
2805 case AArch64::STRDroX:
2806 case AArch64::LDRXroX:
2807 case AArch64::STRXroX:
2808 case AArch64::LDRDui:
2809 case AArch64::STRDui:
2810 case AArch64::LDRXui:
2811 case AArch64::STRXui:
2812 NumBytes = 8;
2813 OffsetScale = 8;
2814 break;
2815
2816 case AArch64::LDRWroX:
2817 case AArch64::LDRSWroX:
2818 case AArch64::STRWroX:
2819 case AArch64::LDRWui:
2820 case AArch64::LDRSWui:
2821 case AArch64::STRWui:
2822 NumBytes = 4;
2823 OffsetScale = 4;
2824 break;
2825
2826 case AArch64::LDRHroX:
2827 case AArch64::STRHroX:
2828 case AArch64::LDRHHroX:
2829 case AArch64::STRHHroX:
2830 case AArch64::LDRSHXroX:
2831 case AArch64::LDRSHWroX:
2832 case AArch64::LDRHui:
2833 case AArch64::STRHui:
2834 case AArch64::LDRHHui:
2835 case AArch64::STRHHui:
2836 case AArch64::LDRSHXui:
2837 case AArch64::LDRSHWui:
2838 NumBytes = 2;
2839 OffsetScale = 2;
2840 break;
2841 }
2842
2843 // Check the fold operand is not the loaded/stored value.
2844 const MachineOperand &BaseRegOp = MemI.getOperand(0);
2845 if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
2846 return false;
2847
2848 // Handle memory instructions with a [Reg, Reg] addressing mode.
2849 if (MemI.getOperand(2).isReg()) {
2850 // Bail if the addressing mode already includes extension of the offset
2851 // register.
2852 if (MemI.getOperand(3).getImm())
2853 return false;
2854
2855 // Check if we actually have a scaled offset.
2856 if (MemI.getOperand(4).getImm() == 0)
2857 OffsetScale = 1;
2858
2859 // If the address instruction is folded into the base register, then the
2860 // addressing mode must not have a scale, since then the base and the
2861 // scaled registers need to be swapped.
2862 if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1)
2863 return false;
2864
2865 switch (AddrI.getOpcode()) {
2866 default:
2867 return false;
2868
2869 case AArch64::SBFMXri:
2870 // sxtw Xa, Wm
2871 // ldr Xd, [Xn, Xa, lsl #N]
2872 // ->
2873 // ldr Xd, [Xn, Wm, sxtw #N]
2874 if (AddrI.getOperand(2).getImm() != 0 ||
2875 AddrI.getOperand(3).getImm() != 31)
2876 return false;
2877
2878 AM.BaseReg = MemI.getOperand(1).getReg();
2879 if (AM.BaseReg == Reg)
2880 AM.BaseReg = MemI.getOperand(2).getReg();
2881 AM.ScaledReg = AddrI.getOperand(1).getReg();
2882 AM.Scale = OffsetScale;
2883 AM.Displacement = 0;
2884 AM.Form = ExtAddrMode::Formula::SExtScaledReg;
2885 return true;
2886
2887 case TargetOpcode::SUBREG_TO_REG: {
2888 // mov Wa, Wm
2889 // ldr Xd, [Xn, Xa, lsl #N]
2890 // ->
2891 // ldr Xd, [Xn, Wm, uxtw #N]
2892
2893 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
2894 if (AddrI.getOperand(1).getImm() != 0 ||
2895 AddrI.getOperand(3).getImm() != AArch64::sub_32)
2896 return false;
2897
2898 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
2899 Register OffsetReg = AddrI.getOperand(2).getReg();
2900 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg))
2901 return false;
2902
2903 const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg);
2904 if (DefMI.getOpcode() != AArch64::ORRWrs ||
2905 DefMI.getOperand(1).getReg() != AArch64::WZR ||
2906 DefMI.getOperand(3).getImm() != 0)
2907 return false;
2908
2909 AM.BaseReg = MemI.getOperand(1).getReg();
2910 if (AM.BaseReg == Reg)
2911 AM.BaseReg = MemI.getOperand(2).getReg();
2912 AM.ScaledReg = DefMI.getOperand(2).getReg();
2913 AM.Scale = OffsetScale;
2914 AM.Displacement = 0;
2915 AM.Form = ExtAddrMode::Formula::ZExtScaledReg;
2916 return true;
2917 }
2918 }
2919 }
2920
2921 // Handle memory instructions with a [Reg, #Imm] addressing mode.
2922
2923 // Check we are not breaking a potential conversion to an LDP.
2924 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
2925 int64_t NewOffset) -> bool {
2926 int64_t MinOffset, MaxOffset;
2927 switch (NumBytes) {
2928 default:
2929 return true;
2930 case 4:
2931 MinOffset = -256;
2932 MaxOffset = 252;
2933 break;
2934 case 8:
2935 MinOffset = -512;
2936 MaxOffset = 504;
2937 break;
2938 case 16:
2939 MinOffset = -1024;
2940 MaxOffset = 1008;
2941 break;
2942 }
2943 return OldOffset < MinOffset || OldOffset > MaxOffset ||
2944 (NewOffset >= MinOffset && NewOffset <= MaxOffset);
2945 };
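  // For example, with NumBytes == 8 an LDP-able offset must lie in [-512, 504];
  // folding an "add ..., #1024" into an 8-byte load at offset 0 would move an
  // in-range offset out of that range and could block a later LDP, so the fold
  // is rejected.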
2946 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
2947 int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale;
2948 int64_t NewOffset = OldOffset + Disp;
2949 if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0))
2950 return false;
2951 // If the old offset would fit into an LDP, but the new offset wouldn't,
2952 // bail out.
2953 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
2954 return false;
2955 AM.BaseReg = AddrI.getOperand(1).getReg();
2956 AM.ScaledReg = 0;
2957 AM.Scale = 0;
2958 AM.Displacement = NewOffset;
2959 AM.Form = ExtAddrMode::Formula::Basic;
2960 return true;
2961 };
2962
2963 auto canFoldAddRegIntoAddrMode =
2964 [&](int64_t Scale,
2965 ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool {
2966 if (MemI.getOperand(2).getImm() != 0)
2967 return false;
2968 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
2969 return false;
2970 AM.BaseReg = AddrI.getOperand(1).getReg();
2971 AM.ScaledReg = AddrI.getOperand(2).getReg();
2972 AM.Scale = Scale;
2973 AM.Displacement = 0;
2974 AM.Form = Form;
2975 return true;
2976 };
2977
2978 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
2979 unsigned Opcode = MemI.getOpcode();
2980 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
2981 Subtarget.isSTRQroSlow();
2982 };
2983
2984 int64_t Disp = 0;
2985 const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
2986 switch (AddrI.getOpcode()) {
2987 default:
2988 return false;
2989
2990 case AArch64::ADDXri:
2991 // add Xa, Xn, #N
2992 // ldr Xd, [Xa, #M]
2993 // ->
2994 // ldr Xd, [Xn, #N'+M]
2995 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
2996 return canFoldAddSubImmIntoAddrMode(Disp);
2997
2998 case AArch64::SUBXri:
2999 // sub Xa, Xn, #N
3000 // ldr Xd, [Xa, #M]
3001 // ->
3002 // ldr Xd, [Xn, #N'+M]
3003 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm();
3004 return canFoldAddSubImmIntoAddrMode(-Disp);
3005
3006 case AArch64::ADDXrs: {
3007 // add Xa, Xn, Xm, lsl #N
3008 // ldr Xd, [Xa]
3009 // ->
3010 // ldr Xd, [Xn, Xm, lsl #N]
3011
3012 // Don't fold the add if the result would be slower, unless optimising for
3013 // size.
3014 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3015 if (AArch64_AM::getShiftType(Shift) != AArch64_AM::ShiftExtendType::LSL)
3016 return false;
3017 Shift = AArch64_AM::getShiftValue(Shift);
3018 if (!OptSize) {
3019 if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
3020 return false;
3021 if (avoidSlowSTRQ(MemI))
3022 return false;
3023 }
3024 return canFoldAddRegIntoAddrMode(1ULL << Shift);
3025 }
3026
3027 case AArch64::ADDXrr:
3028 // add Xa, Xn, Xm
3029 // ldr Xd, [Xa]
3030 // ->
3031 // ldr Xd, [Xn, Xm, lsl #0]
3032
3033 // Don't fold the add if the result would be slower, unless optimising for
3034 // size.
3035 if (!OptSize && avoidSlowSTRQ(MemI))
3036 return false;
3037 return canFoldAddRegIntoAddrMode(1);
3038
3039 case AArch64::ADDXrx:
3040 // add Xa, Xn, Wm, {s,u}xtw #N
3041 // ldr Xd, [Xa]
3042 // ->
3043 // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3044
3045 // Don't fold the add if the result would be slower, unless optimising for
3046 // size.
3047 if (!OptSize && avoidSlowSTRQ(MemI))
3048 return false;
3049
3050 // Can fold only sign-/zero-extend of a word.
3051 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm());
3052 AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm);
3053 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3054 return false;
3055
3056 return canFoldAddRegIntoAddrMode(
3057 1ULL << AArch64_AM::getArithShiftValue(Imm),
3058 (Extend == AArch64_AM::SXTW) ? ExtAddrMode::Formula::SExtScaledReg
3059 : ExtAddrMode::Formula::ZExtScaledReg);
3060 }
3061 }
3062
3063 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3064 // return the opcode of an instruction performing the same operation, but using
3065 // the [Reg, Reg] addressing mode.
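// For example, both LDRXui ("ldr x0, [x1, #imm]") and LDURXi map to LDRXroX
// ("ldr x0, [x1, x2]").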
3066 static unsigned regOffsetOpcode(unsigned Opcode) {
3067 switch (Opcode) {
3068 default:
3069 llvm_unreachable("Address folding not implemented for instruction");
3070
3071 case AArch64::LDURQi:
3072 case AArch64::LDRQui:
3073 return AArch64::LDRQroX;
3074 case AArch64::STURQi:
3075 case AArch64::STRQui:
3076 return AArch64::STRQroX;
3077 case AArch64::LDURDi:
3078 case AArch64::LDRDui:
3079 return AArch64::LDRDroX;
3080 case AArch64::STURDi:
3081 case AArch64::STRDui:
3082 return AArch64::STRDroX;
3083 case AArch64::LDURXi:
3084 case AArch64::LDRXui:
3085 return AArch64::LDRXroX;
3086 case AArch64::STURXi:
3087 case AArch64::STRXui:
3088 return AArch64::STRXroX;
3089 case AArch64::LDURWi:
3090 case AArch64::LDRWui:
3091 return AArch64::LDRWroX;
3092 case AArch64::LDURSWi:
3093 case AArch64::LDRSWui:
3094 return AArch64::LDRSWroX;
3095 case AArch64::STURWi:
3096 case AArch64::STRWui:
3097 return AArch64::STRWroX;
3098 case AArch64::LDURHi:
3099 case AArch64::LDRHui:
3100 return AArch64::LDRHroX;
3101 case AArch64::STURHi:
3102 case AArch64::STRHui:
3103 return AArch64::STRHroX;
3104 case AArch64::LDURHHi:
3105 case AArch64::LDRHHui:
3106 return AArch64::LDRHHroX;
3107 case AArch64::STURHHi:
3108 case AArch64::STRHHui:
3109 return AArch64::STRHHroX;
3110 case AArch64::LDURSHXi:
3111 case AArch64::LDRSHXui:
3112 return AArch64::LDRSHXroX;
3113 case AArch64::LDURSHWi:
3114 case AArch64::LDRSHWui:
3115 return AArch64::LDRSHWroX;
3116 case AArch64::LDURBi:
3117 case AArch64::LDRBui:
3118 return AArch64::LDRBroX;
3119 case AArch64::LDURBBi:
3120 case AArch64::LDRBBui:
3121 return AArch64::LDRBBroX;
3122 case AArch64::LDURSBXi:
3123 case AArch64::LDRSBXui:
3124 return AArch64::LDRSBXroX;
3125 case AArch64::LDURSBWi:
3126 case AArch64::LDRSBWui:
3127 return AArch64::LDRSBWroX;
3128 case AArch64::STURBi:
3129 case AArch64::STRBui:
3130 return AArch64::STRBroX;
3131 case AArch64::STURBBi:
3132 case AArch64::STRBBui:
3133 return AArch64::STRBBroX;
3134 }
3135 }
3136
3137 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3138 // the opcode of an instruction performing the same operation, but using the
3139 // [Reg, #Imm] addressing mode with scaled offset.
3140 unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3141 switch (Opcode) {
3142 default:
3143 llvm_unreachable("Address folding not implemented for instruction");
3144
3145 case AArch64::LDURQi:
3146 Scale = 16;
3147 return AArch64::LDRQui;
3148 case AArch64::STURQi:
3149 Scale = 16;
3150 return AArch64::STRQui;
3151 case AArch64::LDURDi:
3152 Scale = 8;
3153 return AArch64::LDRDui;
3154 case AArch64::STURDi:
3155 Scale = 8;
3156 return AArch64::STRDui;
3157 case AArch64::LDURXi:
3158 Scale = 8;
3159 return AArch64::LDRXui;
3160 case AArch64::STURXi:
3161 Scale = 8;
3162 return AArch64::STRXui;
3163 case AArch64::LDURWi:
3164 Scale = 4;
3165 return AArch64::LDRWui;
3166 case AArch64::LDURSWi:
3167 Scale = 4;
3168 return AArch64::LDRSWui;
3169 case AArch64::STURWi:
3170 Scale = 4;
3171 return AArch64::STRWui;
3172 case AArch64::LDURHi:
3173 Scale = 2;
3174 return AArch64::LDRHui;
3175 case AArch64::STURHi:
3176 Scale = 2;
3177 return AArch64::STRHui;
3178 case AArch64::LDURHHi:
3179 Scale = 2;
3180 return AArch64::LDRHHui;
3181 case AArch64::STURHHi:
3182 Scale = 2;
3183 return AArch64::STRHHui;
3184 case AArch64::LDURSHXi:
3185 Scale = 2;
3186 return AArch64::LDRSHXui;
3187 case AArch64::LDURSHWi:
3188 Scale = 2;
3189 return AArch64::LDRSHWui;
3190 case AArch64::LDURBi:
3191 Scale = 1;
3192 return AArch64::LDRBui;
3193 case AArch64::LDURBBi:
3194 Scale = 1;
3195 return AArch64::LDRBBui;
3196 case AArch64::LDURSBXi:
3197 Scale = 1;
3198 return AArch64::LDRSBXui;
3199 case AArch64::LDURSBWi:
3200 Scale = 1;
3201 return AArch64::LDRSBWui;
3202 case AArch64::STURBi:
3203 Scale = 1;
3204 return AArch64::STRBui;
3205 case AArch64::STURBBi:
3206 Scale = 1;
3207 return AArch64::STRBBui;
3208 case AArch64::LDRQui:
3209 case AArch64::STRQui:
3210 Scale = 16;
3211 return Opcode;
3212 case AArch64::LDRDui:
3213 case AArch64::STRDui:
3214 case AArch64::LDRXui:
3215 case AArch64::STRXui:
3216 Scale = 8;
3217 return Opcode;
3218 case AArch64::LDRWui:
3219 case AArch64::LDRSWui:
3220 case AArch64::STRWui:
3221 Scale = 4;
3222 return Opcode;
3223 case AArch64::LDRHui:
3224 case AArch64::STRHui:
3225 case AArch64::LDRHHui:
3226 case AArch64::STRHHui:
3227 case AArch64::LDRSHXui:
3228 case AArch64::LDRSHWui:
3229 Scale = 2;
3230 return Opcode;
3231 case AArch64::LDRBui:
3232 case AArch64::LDRBBui:
3233 case AArch64::LDRSBXui:
3234 case AArch64::LDRSBWui:
3235 case AArch64::STRBui:
3236 case AArch64::STRBBui:
3237 Scale = 1;
3238 return Opcode;
3239 }
3240 }
3241
3242 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3243 // the opcode of an instruction performing the same operation, but using the
3244 // [Reg, #Imm] addressing mode with unscaled offset.
3245 unsigned unscaledOffsetOpcode(unsigned Opcode) {
3246 switch (Opcode) {
3247 default:
3248 llvm_unreachable("Address folding not implemented for instruction");
3249
3250 case AArch64::LDURQi:
3251 case AArch64::STURQi:
3252 case AArch64::LDURDi:
3253 case AArch64::STURDi:
3254 case AArch64::LDURXi:
3255 case AArch64::STURXi:
3256 case AArch64::LDURWi:
3257 case AArch64::LDURSWi:
3258 case AArch64::STURWi:
3259 case AArch64::LDURHi:
3260 case AArch64::STURHi:
3261 case AArch64::LDURHHi:
3262 case AArch64::STURHHi:
3263 case AArch64::LDURSHXi:
3264 case AArch64::LDURSHWi:
3265 case AArch64::LDURBi:
3266 case AArch64::STURBi:
3267 case AArch64::LDURBBi:
3268 case AArch64::STURBBi:
3269 case AArch64::LDURSBWi:
3270 case AArch64::LDURSBXi:
3271 return Opcode;
3272 case AArch64::LDRQui:
3273 return AArch64::LDURQi;
3274 case AArch64::STRQui:
3275 return AArch64::STURQi;
3276 case AArch64::LDRDui:
3277 return AArch64::LDURDi;
3278 case AArch64::STRDui:
3279 return AArch64::STURDi;
3280 case AArch64::LDRXui:
3281 return AArch64::LDURXi;
3282 case AArch64::STRXui:
3283 return AArch64::STURXi;
3284 case AArch64::LDRWui:
3285 return AArch64::LDURWi;
3286 case AArch64::LDRSWui:
3287 return AArch64::LDURSWi;
3288 case AArch64::STRWui:
3289 return AArch64::STURWi;
3290 case AArch64::LDRHui:
3291 return AArch64::LDURHi;
3292 case AArch64::STRHui:
3293 return AArch64::STURHi;
3294 case AArch64::LDRHHui:
3295 return AArch64::LDURHHi;
3296 case AArch64::STRHHui:
3297 return AArch64::STURHHi;
3298 case AArch64::LDRSHXui:
3299 return AArch64::LDURSHXi;
3300 case AArch64::LDRSHWui:
3301 return AArch64::LDURSHWi;
3302 case AArch64::LDRBBui:
3303 return AArch64::LDURBBi;
3304 case AArch64::LDRBui:
3305 return AArch64::LDURBi;
3306 case AArch64::STRBBui:
3307 return AArch64::STURBBi;
3308 case AArch64::STRBui:
3309 return AArch64::STURBi;
3310 case AArch64::LDRSBWui:
3311 return AArch64::LDURSBWi;
3312 case AArch64::LDRSBXui:
3313 return AArch64::LDURSBXi;
3314 }
3315 }
3316
3317 // Given the opcode of a memory load/store instruction, return the opcode of an
3318 // instruction performing the same operation, but using
3319 // the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
3320 // offset register.
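// For example, the scaled, unscaled and X-register forms of a 64-bit load all
// map to the W-register extended-offset form (a sketch of the table below):
//   offsetExtendOpcode(AArch64::LDRXui);  // == AArch64::LDRXroW
//   offsetExtendOpcode(AArch64::LDURXi);  // == AArch64::LDRXroW
//   offsetExtendOpcode(AArch64::LDRXroX); // == AArch64::LDRXroW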
3321 static unsigned offsetExtendOpcode(unsigned Opcode) {
3322 switch (Opcode) {
3323 default:
3324 llvm_unreachable("Address folding not implemented for instruction");
3325
3326 case AArch64::LDRQroX:
3327 case AArch64::LDURQi:
3328 case AArch64::LDRQui:
3329 return AArch64::LDRQroW;
3330 case AArch64::STRQroX:
3331 case AArch64::STURQi:
3332 case AArch64::STRQui:
3333 return AArch64::STRQroW;
3334 case AArch64::LDRDroX:
3335 case AArch64::LDURDi:
3336 case AArch64::LDRDui:
3337 return AArch64::LDRDroW;
3338 case AArch64::STRDroX:
3339 case AArch64::STURDi:
3340 case AArch64::STRDui:
3341 return AArch64::STRDroW;
3342 case AArch64::LDRXroX:
3343 case AArch64::LDURXi:
3344 case AArch64::LDRXui:
3345 return AArch64::LDRXroW;
3346 case AArch64::STRXroX:
3347 case AArch64::STURXi:
3348 case AArch64::STRXui:
3349 return AArch64::STRXroW;
3350 case AArch64::LDRWroX:
3351 case AArch64::LDURWi:
3352 case AArch64::LDRWui:
3353 return AArch64::LDRWroW;
3354 case AArch64::LDRSWroX:
3355 case AArch64::LDURSWi:
3356 case AArch64::LDRSWui:
3357 return AArch64::LDRSWroW;
3358 case AArch64::STRWroX:
3359 case AArch64::STURWi:
3360 case AArch64::STRWui:
3361 return AArch64::STRWroW;
3362 case AArch64::LDRHroX:
3363 case AArch64::LDURHi:
3364 case AArch64::LDRHui:
3365 return AArch64::LDRHroW;
3366 case AArch64::STRHroX:
3367 case AArch64::STURHi:
3368 case AArch64::STRHui:
3369 return AArch64::STRHroW;
3370 case AArch64::LDRHHroX:
3371 case AArch64::LDURHHi:
3372 case AArch64::LDRHHui:
3373 return AArch64::LDRHHroW;
3374 case AArch64::STRHHroX:
3375 case AArch64::STURHHi:
3376 case AArch64::STRHHui:
3377 return AArch64::STRHHroW;
3378 case AArch64::LDRSHXroX:
3379 case AArch64::LDURSHXi:
3380 case AArch64::LDRSHXui:
3381 return AArch64::LDRSHXroW;
3382 case AArch64::LDRSHWroX:
3383 case AArch64::LDURSHWi:
3384 case AArch64::LDRSHWui:
3385 return AArch64::LDRSHWroW;
3386 case AArch64::LDRBroX:
3387 case AArch64::LDURBi:
3388 case AArch64::LDRBui:
3389 return AArch64::LDRBroW;
3390 case AArch64::LDRBBroX:
3391 case AArch64::LDURBBi:
3392 case AArch64::LDRBBui:
3393 return AArch64::LDRBBroW;
3394 case AArch64::LDRSBXroX:
3395 case AArch64::LDURSBXi:
3396 case AArch64::LDRSBXui:
3397 return AArch64::LDRSBXroW;
3398 case AArch64::LDRSBWroX:
3399 case AArch64::LDURSBWi:
3400 case AArch64::LDRSBWui:
3401 return AArch64::LDRSBWroW;
3402 case AArch64::STRBroX:
3403 case AArch64::STURBi:
3404 case AArch64::STRBui:
3405 return AArch64::STRBroW;
3406 case AArch64::STRBBroX:
3407 case AArch64::STURBBi:
3408 case AArch64::STRBBui:
3409 return AArch64::STRBBroW;
3410 }
3411 }
3412
3413 MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI,
3414 const ExtAddrMode &AM) const {
3415
3416 const DebugLoc &DL = MemI.getDebugLoc();
3417 MachineBasicBlock &MBB = *MemI.getParent();
3418 MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();
3419
3420 if (AM.Form == ExtAddrMode::Formula::Basic) {
3421 if (AM.ScaledReg) {
3422 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
3423 unsigned Opcode = regOffsetOpcode(MemI.getOpcode());
3424 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
3425 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3426 .addReg(MemI.getOperand(0).getReg(),
3427 MemI.mayLoad() ? RegState::Define : 0)
3428 .addReg(AM.BaseReg)
3429 .addReg(AM.ScaledReg)
3430 .addImm(0)
3431 .addImm(AM.Scale > 1)
3432 .setMemRefs(MemI.memoperands())
3433 .setMIFlags(MemI.getFlags());
3434 return B.getInstr();
3435 }
3436
3437 assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
3438 "Addressing mode not supported for folding");
3439
3440 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
3441 unsigned Scale = 1;
3442 unsigned Opcode = MemI.getOpcode();
3443 if (isInt<9>(AM.Displacement))
3444 Opcode = unscaledOffsetOpcode(Opcode);
3445 else
3446 Opcode = scaledOffsetOpcode(Opcode, Scale);
3447
3448 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3449 .addReg(MemI.getOperand(0).getReg(),
3450 MemI.mayLoad() ? RegState::Define : 0)
3451 .addReg(AM.BaseReg)
3452 .addImm(AM.Displacement / Scale)
3453 .setMemRefs(MemI.memoperands())
3454 .setMIFlags(MemI.getFlags());
3455 return B.getInstr();
3456 }
3457
3458 if (AM.Form == ExtAddrMode::Formula::SExtScaledReg ||
3459 AM.Form == ExtAddrMode::Formula::ZExtScaledReg) {
3460 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
3461 assert(AM.ScaledReg && !AM.Displacement &&
3462 "Address offset can be a register or an immediate, but not both");
3463 unsigned Opcode = offsetExtendOpcode(MemI.getOpcode());
3464 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
3465 // Make sure the offset register is in the correct register class.
3466 Register OffsetReg = AM.ScaledReg;
3467 const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg);
3468 if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {
3469 OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3470 BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)
3471 .addReg(AM.ScaledReg, 0, AArch64::sub_32);
3472 }
3473 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3474 .addReg(MemI.getOperand(0).getReg(),
3475 MemI.mayLoad() ? RegState::Define : 0)
3476 .addReg(AM.BaseReg)
3477 .addReg(OffsetReg)
3478 .addImm(AM.Form == ExtAddrMode::Formula::SExtScaledReg)
3479 .addImm(AM.Scale != 1)
3480 .setMemRefs(MemI.memoperands())
3481 .setMIFlags(MemI.getFlags());
3482
3483 return B.getInstr();
3484 }
3485
3486 llvm_unreachable(
3487 "Function must not be called with an addressing mode it can't handle");
3488 }
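// Illustrative sketch of the instruction shapes emitted above, assuming a
// 64-bit load with AM.Scale == 8 (not an exhaustive list):
//   Basic + ScaledReg:             ldr x0, [x1, x2, lsl #3]
//   Basic + immediate:             ldr x0, [x1, #imm]  or  ldur x0, [x1, #imm]
//   SExtScaledReg / ZExtScaledReg: ldr x0, [x1, w2, sxtw #3]  /  [x1, w2, uxtw #3]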
3489
3490 bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
3491 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
3492 bool &OffsetIsScalable, TypeSize &Width,
3493 const TargetRegisterInfo *TRI) const {
3494 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
3495 // Handle only loads/stores with base register followed by immediate offset.
3496 if (LdSt.getNumExplicitOperands() == 3) {
3497 // Non-paired instruction (e.g., ldr x1, [x0, #8]).
3498 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) ||
3499 !LdSt.getOperand(2).isImm())
3500 return false;
3501 } else if (LdSt.getNumExplicitOperands() == 4) {
3502 // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
3503 if (!LdSt.getOperand(1).isReg() ||
3504 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) ||
3505 !LdSt.getOperand(3).isImm())
3506 return false;
3507 } else
3508 return false;
3509
3510 // Get the scaling factor for the instruction and set the width for the
3511 // instruction.
3512 TypeSize Scale(0U, false);
3513 int64_t Dummy1, Dummy2;
3514
3515 // If this returns false, then it's an instruction we don't want to handle.
3516 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
3517 return false;
3518
3519 // Compute the offset. Offset is calculated as the immediate operand
3520 // multiplied by the scaling factor. Unscaled instructions have scaling factor
3521 // set to 1.
3522 if (LdSt.getNumExplicitOperands() == 3) {
3523 BaseOp = &LdSt.getOperand(1);
3524 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue();
3525 } else {
3526 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
3527 BaseOp = &LdSt.getOperand(2);
3528 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue();
3529 }
3530 OffsetIsScalable = Scale.isScalable();
3531
3532 if (!BaseOp->isReg() && !BaseOp->isFI())
3533 return false;
3534
3535 return true;
3536 }
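// Illustrative example: for `ldr x1, [x0, #8]` (LDRXui with immediate operand
// 1), the code above is expected to return BaseOp pointing at the x0 operand,
// Offset == 1 * 8 == 8 bytes, OffsetIsScalable == false and Width == 8.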
3537
3538 MachineOperand &
3539 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
3540 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
3541 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1);
3542 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
3543 return OfsOp;
3544 }
3545
3546 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
3547 TypeSize &Width, int64_t &MinOffset,
3548 int64_t &MaxOffset) {
3549 switch (Opcode) {
3550   // Not a memory operation, or not one we want to handle.
3551 default:
3552 Scale = TypeSize::getFixed(0);
3553 Width = TypeSize::getFixed(0);
3554 MinOffset = MaxOffset = 0;
3555 return false;
3556 // LDR / STR
3557 case AArch64::LDRQui:
3558 case AArch64::STRQui:
3559 Scale = TypeSize::getFixed(16);
3560 Width = TypeSize::getFixed(16);
3561 MinOffset = 0;
3562 MaxOffset = 4095;
3563 break;
3564 case AArch64::LDRXui:
3565 case AArch64::LDRDui:
3566 case AArch64::STRXui:
3567 case AArch64::STRDui:
3568 case AArch64::PRFMui:
3569 Scale = TypeSize::getFixed(8);
3570 Width = TypeSize::getFixed(8);
3571 MinOffset = 0;
3572 MaxOffset = 4095;
3573 break;
3574 case AArch64::LDRWui:
3575 case AArch64::LDRSui:
3576 case AArch64::LDRSWui:
3577 case AArch64::STRWui:
3578 case AArch64::STRSui:
3579 Scale = TypeSize::getFixed(4);
3580 Width = TypeSize::getFixed(4);
3581 MinOffset = 0;
3582 MaxOffset = 4095;
3583 break;
3584 case AArch64::LDRHui:
3585 case AArch64::LDRHHui:
3586 case AArch64::LDRSHWui:
3587 case AArch64::LDRSHXui:
3588 case AArch64::STRHui:
3589 case AArch64::STRHHui:
3590 Scale = TypeSize::getFixed(2);
3591 Width = TypeSize::getFixed(2);
3592 MinOffset = 0;
3593 MaxOffset = 4095;
3594 break;
3595 case AArch64::LDRBui:
3596 case AArch64::LDRBBui:
3597 case AArch64::LDRSBWui:
3598 case AArch64::LDRSBXui:
3599 case AArch64::STRBui:
3600 case AArch64::STRBBui:
3601 Scale = TypeSize::getFixed(1);
3602 Width = TypeSize::getFixed(1);
3603 MinOffset = 0;
3604 MaxOffset = 4095;
3605 break;
3606 // post/pre inc
3607 case AArch64::STRQpre:
3608 case AArch64::LDRQpost:
3609 Scale = TypeSize::getFixed(1);
3610 Width = TypeSize::getFixed(16);
3611 MinOffset = -256;
3612 MaxOffset = 255;
3613 break;
3614 case AArch64::STRXpre:
3615 case AArch64::STRDpre:
3616 case AArch64::LDRXpost:
3617 case AArch64::LDRDpost:
3618 Scale = TypeSize::getFixed(1);
3619 Width = TypeSize::getFixed(8);
3620 MinOffset = -256;
3621 MaxOffset = 255;
3622 break;
3623 case AArch64::STRWpost:
3624 case AArch64::LDRWpost:
3625 Scale = TypeSize::getFixed(4);
3626 Width = TypeSize::getFixed(32);
3627 MinOffset = -256;
3628 MaxOffset = 255;
3629 break;
3630 // Unscaled
3631 case AArch64::LDURQi:
3632 case AArch64::STURQi:
3633 Scale = TypeSize::getFixed(1);
3634 Width = TypeSize::getFixed(16);
3635 MinOffset = -256;
3636 MaxOffset = 255;
3637 break;
3638 case AArch64::LDURXi:
3639 case AArch64::LDURDi:
3640 case AArch64::LDAPURXi:
3641 case AArch64::STURXi:
3642 case AArch64::STURDi:
3643 case AArch64::STLURXi:
3644 case AArch64::PRFUMi:
3645 Scale = TypeSize::getFixed(1);
3646 Width = TypeSize::getFixed(8);
3647 MinOffset = -256;
3648 MaxOffset = 255;
3649 break;
3650 case AArch64::LDURWi:
3651 case AArch64::LDURSi:
3652 case AArch64::LDURSWi:
3653 case AArch64::LDAPURi:
3654 case AArch64::LDAPURSWi:
3655 case AArch64::STURWi:
3656 case AArch64::STURSi:
3657 case AArch64::STLURWi:
3658 Scale = TypeSize::getFixed(1);
3659 Width = TypeSize::getFixed(4);
3660 MinOffset = -256;
3661 MaxOffset = 255;
3662 break;
3663 case AArch64::LDURHi:
3664 case AArch64::LDURHHi:
3665 case AArch64::LDURSHXi:
3666 case AArch64::LDURSHWi:
3667 case AArch64::LDAPURHi:
3668 case AArch64::LDAPURSHWi:
3669 case AArch64::LDAPURSHXi:
3670 case AArch64::STURHi:
3671 case AArch64::STURHHi:
3672 case AArch64::STLURHi:
3673 Scale = TypeSize::getFixed(1);
3674 Width = TypeSize::getFixed(2);
3675 MinOffset = -256;
3676 MaxOffset = 255;
3677 break;
3678 case AArch64::LDURBi:
3679 case AArch64::LDURBBi:
3680 case AArch64::LDURSBXi:
3681 case AArch64::LDURSBWi:
3682 case AArch64::LDAPURBi:
3683 case AArch64::LDAPURSBWi:
3684 case AArch64::LDAPURSBXi:
3685 case AArch64::STURBi:
3686 case AArch64::STURBBi:
3687 case AArch64::STLURBi:
3688 Scale = TypeSize::getFixed(1);
3689 Width = TypeSize::getFixed(1);
3690 MinOffset = -256;
3691 MaxOffset = 255;
3692 break;
3693 // LDP / STP
3694 case AArch64::LDPQi:
3695 case AArch64::LDNPQi:
3696 case AArch64::STPQi:
3697 case AArch64::STNPQi:
3698 Scale = TypeSize::getFixed(16);
3699 Width = TypeSize::getFixed(32);
3700 MinOffset = -64;
3701 MaxOffset = 63;
3702 break;
3703 case AArch64::LDPXi:
3704 case AArch64::LDPDi:
3705 case AArch64::LDNPXi:
3706 case AArch64::LDNPDi:
3707 case AArch64::STPXi:
3708 case AArch64::STPDi:
3709 case AArch64::STNPXi:
3710 case AArch64::STNPDi:
3711 Scale = TypeSize::getFixed(8);
3712 Width = TypeSize::getFixed(16);
3713 MinOffset = -64;
3714 MaxOffset = 63;
3715 break;
3716 case AArch64::LDPWi:
3717 case AArch64::LDPSi:
3718 case AArch64::LDNPWi:
3719 case AArch64::LDNPSi:
3720 case AArch64::STPWi:
3721 case AArch64::STPSi:
3722 case AArch64::STNPWi:
3723 case AArch64::STNPSi:
3724 Scale = TypeSize::getFixed(4);
3725 Width = TypeSize::getFixed(8);
3726 MinOffset = -64;
3727 MaxOffset = 63;
3728 break;
3729 // pre/post inc
3730 case AArch64::STPQpre:
3731 case AArch64::LDPQpost:
3732 Scale = TypeSize::getFixed(16);
3733 Width = TypeSize::getFixed(16);
3734 MinOffset = -1024;
3735 MaxOffset = 1008;
3736 break;
3737 case AArch64::STPXpre:
3738 case AArch64::LDPXpost:
3739 case AArch64::STPDpre:
3740 case AArch64::LDPDpost:
3741 Scale = TypeSize::getFixed(8);
3742 Width = TypeSize::getFixed(8);
3743 MinOffset = -512;
3744 MaxOffset = 504;
3745 break;
3746 case AArch64::StoreSwiftAsyncContext:
3747 // Store is an STRXui, but there might be an ADDXri in the expansion too.
3748 Scale = TypeSize::getFixed(1);
3749 Width = TypeSize::getFixed(8);
3750 MinOffset = 0;
3751 MaxOffset = 4095;
3752 break;
3753 case AArch64::ADDG:
3754 Scale = TypeSize::getFixed(16);
3755 Width = TypeSize::getFixed(0);
3756 MinOffset = 0;
3757 MaxOffset = 63;
3758 break;
3759 case AArch64::TAGPstack:
3760 Scale = TypeSize::getFixed(16);
3761 Width = TypeSize::getFixed(0);
3762 // TAGP with a negative offset turns into SUBP, which has a maximum offset
3763 // of 63 (not 64!).
3764 MinOffset = -63;
3765 MaxOffset = 63;
3766 break;
3767 case AArch64::LDG:
3768 case AArch64::STGi:
3769 case AArch64::STZGi:
3770 Scale = TypeSize::getFixed(16);
3771 Width = TypeSize::getFixed(16);
3772 MinOffset = -256;
3773 MaxOffset = 255;
3774 break;
3775 // SVE
3776 case AArch64::STR_ZZZZXI:
3777 case AArch64::LDR_ZZZZXI:
3778 Scale = TypeSize::getScalable(16);
3779 Width = TypeSize::getScalable(16 * 4);
3780 MinOffset = -256;
3781 MaxOffset = 252;
3782 break;
3783 case AArch64::STR_ZZZXI:
3784 case AArch64::LDR_ZZZXI:
3785 Scale = TypeSize::getScalable(16);
3786 Width = TypeSize::getScalable(16 * 3);
3787 MinOffset = -256;
3788 MaxOffset = 253;
3789 break;
3790 case AArch64::STR_ZZXI:
3791 case AArch64::LDR_ZZXI:
3792 Scale = TypeSize::getScalable(16);
3793 Width = TypeSize::getScalable(16 * 2);
3794 MinOffset = -256;
3795 MaxOffset = 254;
3796 break;
3797 case AArch64::LDR_PXI:
3798 case AArch64::STR_PXI:
3799 Scale = TypeSize::getScalable(2);
3800 Width = TypeSize::getScalable(2);
3801 MinOffset = -256;
3802 MaxOffset = 255;
3803 break;
3804 case AArch64::LDR_PPXI:
3805 case AArch64::STR_PPXI:
3806 Scale = TypeSize::getScalable(2);
3807 Width = TypeSize::getScalable(2 * 2);
3808 MinOffset = -256;
3809 MaxOffset = 254;
3810 break;
3811 case AArch64::LDR_ZXI:
3812 case AArch64::STR_ZXI:
3813 Scale = TypeSize::getScalable(16);
3814 Width = TypeSize::getScalable(16);
3815 MinOffset = -256;
3816 MaxOffset = 255;
3817 break;
3818 case AArch64::LD1B_IMM:
3819 case AArch64::LD1H_IMM:
3820 case AArch64::LD1W_IMM:
3821 case AArch64::LD1D_IMM:
3822 case AArch64::LDNT1B_ZRI:
3823 case AArch64::LDNT1H_ZRI:
3824 case AArch64::LDNT1W_ZRI:
3825 case AArch64::LDNT1D_ZRI:
3826 case AArch64::ST1B_IMM:
3827 case AArch64::ST1H_IMM:
3828 case AArch64::ST1W_IMM:
3829 case AArch64::ST1D_IMM:
3830 case AArch64::STNT1B_ZRI:
3831 case AArch64::STNT1H_ZRI:
3832 case AArch64::STNT1W_ZRI:
3833 case AArch64::STNT1D_ZRI:
3834 case AArch64::LDNF1B_IMM:
3835 case AArch64::LDNF1H_IMM:
3836 case AArch64::LDNF1W_IMM:
3837 case AArch64::LDNF1D_IMM:
3838     // A full vector's worth of data
3839 // Width = mbytes * elements
3840 Scale = TypeSize::getScalable(16);
3841 Width = TypeSize::getScalable(16);
3842 MinOffset = -8;
3843 MaxOffset = 7;
3844 break;
3845 case AArch64::LD2B_IMM:
3846 case AArch64::LD2H_IMM:
3847 case AArch64::LD2W_IMM:
3848 case AArch64::LD2D_IMM:
3849 case AArch64::ST2B_IMM:
3850 case AArch64::ST2H_IMM:
3851 case AArch64::ST2W_IMM:
3852 case AArch64::ST2D_IMM:
3853 Scale = TypeSize::getScalable(32);
3854 Width = TypeSize::getScalable(16 * 2);
3855 MinOffset = -8;
3856 MaxOffset = 7;
3857 break;
3858 case AArch64::LD3B_IMM:
3859 case AArch64::LD3H_IMM:
3860 case AArch64::LD3W_IMM:
3861 case AArch64::LD3D_IMM:
3862 case AArch64::ST3B_IMM:
3863 case AArch64::ST3H_IMM:
3864 case AArch64::ST3W_IMM:
3865 case AArch64::ST3D_IMM:
3866 Scale = TypeSize::getScalable(48);
3867 Width = TypeSize::getScalable(16 * 3);
3868 MinOffset = -8;
3869 MaxOffset = 7;
3870 break;
3871 case AArch64::LD4B_IMM:
3872 case AArch64::LD4H_IMM:
3873 case AArch64::LD4W_IMM:
3874 case AArch64::LD4D_IMM:
3875 case AArch64::ST4B_IMM:
3876 case AArch64::ST4H_IMM:
3877 case AArch64::ST4W_IMM:
3878 case AArch64::ST4D_IMM:
3879 Scale = TypeSize::getScalable(64);
3880 Width = TypeSize::getScalable(16 * 4);
3881 MinOffset = -8;
3882 MaxOffset = 7;
3883 break;
3884 case AArch64::LD1B_H_IMM:
3885 case AArch64::LD1SB_H_IMM:
3886 case AArch64::LD1H_S_IMM:
3887 case AArch64::LD1SH_S_IMM:
3888 case AArch64::LD1W_D_IMM:
3889 case AArch64::LD1SW_D_IMM:
3890 case AArch64::ST1B_H_IMM:
3891 case AArch64::ST1H_S_IMM:
3892 case AArch64::ST1W_D_IMM:
3893 case AArch64::LDNF1B_H_IMM:
3894 case AArch64::LDNF1SB_H_IMM:
3895 case AArch64::LDNF1H_S_IMM:
3896 case AArch64::LDNF1SH_S_IMM:
3897 case AArch64::LDNF1W_D_IMM:
3898 case AArch64::LDNF1SW_D_IMM:
3899     // A half vector's worth of data
3900 // Width = mbytes * elements
3901 Scale = TypeSize::getScalable(8);
3902 Width = TypeSize::getScalable(8);
3903 MinOffset = -8;
3904 MaxOffset = 7;
3905 break;
3906 case AArch64::LD1B_S_IMM:
3907 case AArch64::LD1SB_S_IMM:
3908 case AArch64::LD1H_D_IMM:
3909 case AArch64::LD1SH_D_IMM:
3910 case AArch64::ST1B_S_IMM:
3911 case AArch64::ST1H_D_IMM:
3912 case AArch64::LDNF1B_S_IMM:
3913 case AArch64::LDNF1SB_S_IMM:
3914 case AArch64::LDNF1H_D_IMM:
3915 case AArch64::LDNF1SH_D_IMM:
3916     // A quarter vector's worth of data
3917 // Width = mbytes * elements
3918 Scale = TypeSize::getScalable(4);
3919 Width = TypeSize::getScalable(4);
3920 MinOffset = -8;
3921 MaxOffset = 7;
3922 break;
3923 case AArch64::LD1B_D_IMM:
3924 case AArch64::LD1SB_D_IMM:
3925 case AArch64::ST1B_D_IMM:
3926 case AArch64::LDNF1B_D_IMM:
3927 case AArch64::LDNF1SB_D_IMM:
3928     // An eighth vector's worth of data
3929 // Width = mbytes * elements
3930 Scale = TypeSize::getScalable(2);
3931 Width = TypeSize::getScalable(2);
3932 MinOffset = -8;
3933 MaxOffset = 7;
3934 break;
3935 case AArch64::ST2Gi:
3936 case AArch64::STZ2Gi:
3937 Scale = TypeSize::getFixed(16);
3938 Width = TypeSize::getFixed(32);
3939 MinOffset = -256;
3940 MaxOffset = 255;
3941 break;
3942 case AArch64::STGPi:
3943 Scale = TypeSize::getFixed(16);
3944 Width = TypeSize::getFixed(16);
3945 MinOffset = -64;
3946 MaxOffset = 63;
3947 break;
3948 case AArch64::LD1RB_IMM:
3949 case AArch64::LD1RB_H_IMM:
3950 case AArch64::LD1RB_S_IMM:
3951 case AArch64::LD1RB_D_IMM:
3952 case AArch64::LD1RSB_H_IMM:
3953 case AArch64::LD1RSB_S_IMM:
3954 case AArch64::LD1RSB_D_IMM:
3955 Scale = TypeSize::getFixed(1);
3956 Width = TypeSize::getFixed(1);
3957 MinOffset = 0;
3958 MaxOffset = 63;
3959 break;
3960 case AArch64::LD1RH_IMM:
3961 case AArch64::LD1RH_S_IMM:
3962 case AArch64::LD1RH_D_IMM:
3963 case AArch64::LD1RSH_S_IMM:
3964 case AArch64::LD1RSH_D_IMM:
3965 Scale = TypeSize::getFixed(2);
3966 Width = TypeSize::getFixed(2);
3967 MinOffset = 0;
3968 MaxOffset = 63;
3969 break;
3970 case AArch64::LD1RW_IMM:
3971 case AArch64::LD1RW_D_IMM:
3972 case AArch64::LD1RSW_IMM:
3973 Scale = TypeSize::getFixed(4);
3974 Width = TypeSize::getFixed(4);
3975 MinOffset = 0;
3976 MaxOffset = 63;
3977 break;
3978 case AArch64::LD1RD_IMM:
3979 Scale = TypeSize::getFixed(8);
3980 Width = TypeSize::getFixed(8);
3981 MinOffset = 0;
3982 MaxOffset = 63;
3983 break;
3984 }
3985
3986 return true;
3987 }
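// Illustrative sketch of a query against the table above (values follow the
// LDRXui case):
//   TypeSize Scale(0U, false), Width(0U, false);
//   int64_t MinOff, MaxOff;
//   AArch64InstrInfo::getMemOpInfo(AArch64::LDRXui, Scale, Width, MinOff, MaxOff);
//   // Scale == 8, Width == 8, MinOff == 0, MaxOff == 4095,
//   // i.e. encodable byte offsets are 0 .. 8 * 4095 == 32760.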
3988
3989 // Scaling factor for unscaled load or store.
3990 int AArch64InstrInfo::getMemScale(unsigned Opc) {
3991 switch (Opc) {
3992 default:
3993 llvm_unreachable("Opcode has unknown scale!");
3994 case AArch64::LDRBBui:
3995 case AArch64::LDURBBi:
3996 case AArch64::LDRSBWui:
3997 case AArch64::LDURSBWi:
3998 case AArch64::STRBBui:
3999 case AArch64::STURBBi:
4000 return 1;
4001 case AArch64::LDRHHui:
4002 case AArch64::LDURHHi:
4003 case AArch64::LDRSHWui:
4004 case AArch64::LDURSHWi:
4005 case AArch64::STRHHui:
4006 case AArch64::STURHHi:
4007 return 2;
4008 case AArch64::LDRSui:
4009 case AArch64::LDURSi:
4010 case AArch64::LDRSpre:
4011 case AArch64::LDRSWui:
4012 case AArch64::LDURSWi:
4013 case AArch64::LDRSWpre:
4014 case AArch64::LDRWpre:
4015 case AArch64::LDRWui:
4016 case AArch64::LDURWi:
4017 case AArch64::STRSui:
4018 case AArch64::STURSi:
4019 case AArch64::STRSpre:
4020 case AArch64::STRWui:
4021 case AArch64::STURWi:
4022 case AArch64::STRWpre:
4023 case AArch64::LDPSi:
4024 case AArch64::LDPSWi:
4025 case AArch64::LDPWi:
4026 case AArch64::STPSi:
4027 case AArch64::STPWi:
4028 return 4;
4029 case AArch64::LDRDui:
4030 case AArch64::LDURDi:
4031 case AArch64::LDRDpre:
4032 case AArch64::LDRXui:
4033 case AArch64::LDURXi:
4034 case AArch64::LDRXpre:
4035 case AArch64::STRDui:
4036 case AArch64::STURDi:
4037 case AArch64::STRDpre:
4038 case AArch64::STRXui:
4039 case AArch64::STURXi:
4040 case AArch64::STRXpre:
4041 case AArch64::LDPDi:
4042 case AArch64::LDPXi:
4043 case AArch64::STPDi:
4044 case AArch64::STPXi:
4045 return 8;
4046 case AArch64::LDRQui:
4047 case AArch64::LDURQi:
4048 case AArch64::STRQui:
4049 case AArch64::STURQi:
4050 case AArch64::STRQpre:
4051 case AArch64::LDPQi:
4052 case AArch64::LDRQpre:
4053 case AArch64::STPQi:
4054 case AArch64::STGi:
4055 case AArch64::STZGi:
4056 case AArch64::ST2Gi:
4057 case AArch64::STZ2Gi:
4058 case AArch64::STGPi:
4059 return 16;
4060 }
4061 }
4062
4063 bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
4064 switch (MI.getOpcode()) {
4065 default:
4066 return false;
4067 case AArch64::LDRWpre:
4068 case AArch64::LDRXpre:
4069 case AArch64::LDRSWpre:
4070 case AArch64::LDRSpre:
4071 case AArch64::LDRDpre:
4072 case AArch64::LDRQpre:
4073 return true;
4074 }
4075 }
4076
4077 bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
4078 switch (MI.getOpcode()) {
4079 default:
4080 return false;
4081 case AArch64::STRWpre:
4082 case AArch64::STRXpre:
4083 case AArch64::STRSpre:
4084 case AArch64::STRDpre:
4085 case AArch64::STRQpre:
4086 return true;
4087 }
4088 }
4089
4090 bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
4091 return isPreLd(MI) || isPreSt(MI);
4092 }
4093
4094 bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
4095 switch (MI.getOpcode()) {
4096 default:
4097 return false;
4098 case AArch64::LDPSi:
4099 case AArch64::LDPSWi:
4100 case AArch64::LDPDi:
4101 case AArch64::LDPQi:
4102 case AArch64::LDPWi:
4103 case AArch64::LDPXi:
4104 case AArch64::STPSi:
4105 case AArch64::STPDi:
4106 case AArch64::STPQi:
4107 case AArch64::STPWi:
4108 case AArch64::STPXi:
4109 case AArch64::STGPi:
4110 return true;
4111 }
4112 }
4113
4114 const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
4115 unsigned Idx =
4116 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
4117 : 1;
4118 return MI.getOperand(Idx);
4119 }
4120
4121 const MachineOperand &
4122 AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
4123 unsigned Idx =
4124 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
4125 : 2;
4126 return MI.getOperand(Idx);
4127 }
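// Illustrative example of the index selection above: for `ldr x1, [x0, #8]`
// (LDRXui) the base is operand 1 and the offset operand 2, while for
// `ldp x1, x2, [x0, #16]` (LDPXi) or a pre-indexed form the base is operand 2
// and the offset operand 3.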
4128
4129 static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
4130 Register Reg) {
4131 if (MI.getParent() == nullptr)
4132 return nullptr;
4133 const MachineFunction *MF = MI.getParent()->getParent();
4134 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
4135 }
4136
4137 bool AArch64InstrInfo::isHForm(const MachineInstr &MI) {
4138 auto IsHFPR = [&](const MachineOperand &Op) {
4139 if (!Op.isReg())
4140 return false;
4141 auto Reg = Op.getReg();
4142 if (Reg.isPhysical())
4143 return AArch64::FPR16RegClass.contains(Reg);
4144 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4145 return TRC == &AArch64::FPR16RegClass ||
4146 TRC == &AArch64::FPR16_loRegClass;
4147 };
4148 return llvm::any_of(MI.operands(), IsHFPR);
4149 }
4150
4151 bool AArch64InstrInfo::isQForm(const MachineInstr &MI) {
4152 auto IsQFPR = [&](const MachineOperand &Op) {
4153 if (!Op.isReg())
4154 return false;
4155 auto Reg = Op.getReg();
4156 if (Reg.isPhysical())
4157 return AArch64::FPR128RegClass.contains(Reg);
4158 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4159 return TRC == &AArch64::FPR128RegClass ||
4160 TRC == &AArch64::FPR128_loRegClass;
4161 };
4162 return llvm::any_of(MI.operands(), IsQFPR);
4163 }
4164
4165 bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) {
4166 switch (MI.getOpcode()) {
4167 case AArch64::BRK:
4168 case AArch64::HLT:
4169 case AArch64::PACIASP:
4170 case AArch64::PACIBSP:
4171 // Implicit BTI behavior.
4172 return true;
4173 case AArch64::PAUTH_PROLOGUE:
4174 // PAUTH_PROLOGUE expands to PACI(A|B)SP.
4175 return true;
4176 case AArch64::HINT: {
4177 unsigned Imm = MI.getOperand(0).getImm();
4178 // Explicit BTI instruction.
4179 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
4180 return true;
4181 // PACI(A|B)SP instructions.
4182 if (Imm == 25 || Imm == 27)
4183 return true;
4184 return false;
4185 }
4186 default:
4187 return false;
4188 }
4189 }
4190
4191 bool AArch64InstrInfo::isFpOrNEON(Register Reg) {
4192 if (Reg == 0)
4193 return false;
4194 assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON");
4195 return AArch64::FPR128RegClass.contains(Reg) ||
4196 AArch64::FPR64RegClass.contains(Reg) ||
4197 AArch64::FPR32RegClass.contains(Reg) ||
4198 AArch64::FPR16RegClass.contains(Reg) ||
4199 AArch64::FPR8RegClass.contains(Reg);
4200 }
4201
4202 bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) {
4203 auto IsFPR = [&](const MachineOperand &Op) {
4204 if (!Op.isReg())
4205 return false;
4206 auto Reg = Op.getReg();
4207 if (Reg.isPhysical())
4208 return isFpOrNEON(Reg);
4209
4210 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4211 return TRC == &AArch64::FPR128RegClass ||
4212 TRC == &AArch64::FPR128_loRegClass ||
4213 TRC == &AArch64::FPR64RegClass ||
4214 TRC == &AArch64::FPR64_loRegClass ||
4215 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
4216 TRC == &AArch64::FPR8RegClass;
4217 };
4218 return llvm::any_of(MI.operands(), IsFPR);
4219 }
4220
4221 // Scale the unscaled offsets. Returns false if the unscaled offset can't be
4222 // scaled.
4223 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
4224 int Scale = AArch64InstrInfo::getMemScale(Opc);
4225
4226 // If the byte-offset isn't a multiple of the stride, we can't scale this
4227 // offset.
4228 if (Offset % Scale != 0)
4229 return false;
4230
4231 // Convert the byte-offset used by unscaled into an "element" offset used
4232 // by the scaled pair load/store instructions.
4233 Offset /= Scale;
4234 return true;
4235 }
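// Illustrative example: LDURXi has a stride of 8 (getMemScale), so
//   int64_t Off = 16; scaleOffset(AArch64::LDURXi, Off); // true,  Off == 2
//   Off = 12;         scaleOffset(AArch64::LDURXi, Off); // false, not a multiple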
4236
4237 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
4238 if (FirstOpc == SecondOpc)
4239 return true;
4240 // We can also pair sign-ext and zero-ext instructions.
4241 switch (FirstOpc) {
4242 default:
4243 return false;
4244 case AArch64::STRSui:
4245 case AArch64::STURSi:
4246 return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
4247 case AArch64::STRDui:
4248 case AArch64::STURDi:
4249 return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
4250 case AArch64::STRQui:
4251 case AArch64::STURQi:
4252 return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
4253 case AArch64::STRWui:
4254 case AArch64::STURWi:
4255 return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
4256 case AArch64::STRXui:
4257 case AArch64::STURXi:
4258 return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
4259 case AArch64::LDRSui:
4260 case AArch64::LDURSi:
4261 return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
4262 case AArch64::LDRDui:
4263 case AArch64::LDURDi:
4264 return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
4265 case AArch64::LDRQui:
4266 case AArch64::LDURQi:
4267 return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
4268 case AArch64::LDRWui:
4269 case AArch64::LDURWi:
4270 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
4271 case AArch64::LDRSWui:
4272 case AArch64::LDURSWi:
4273 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
4274 case AArch64::LDRXui:
4275 case AArch64::LDURXi:
4276 return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
4277 }
4278 // These instructions can't be paired based on their opcodes.
4279 return false;
4280 }
4281
4282 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
4283 int64_t Offset1, unsigned Opcode1, int FI2,
4284 int64_t Offset2, unsigned Opcode2) {
4285 // Accesses through fixed stack object frame indices may access a different
4286 // fixed stack slot. Check that the object offsets + offsets match.
4287 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) {
4288 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
4289 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
4290 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
4291 // Convert to scaled object offsets.
4292 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
4293 if (ObjectOffset1 % Scale1 != 0)
4294 return false;
4295 ObjectOffset1 /= Scale1;
4296 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
4297 if (ObjectOffset2 % Scale2 != 0)
4298 return false;
4299 ObjectOffset2 /= Scale2;
4300 ObjectOffset1 += Offset1;
4301 ObjectOffset2 += Offset2;
4302 return ObjectOffset1 + 1 == ObjectOffset2;
4303 }
4304
4305 return FI1 == FI2;
4306 }
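// Illustrative example: two 8-byte accesses to fixed stack objects at object
// offsets 0 and 8, each with an instruction offset of 0 and a scale of 8,
// yield scaled object offsets 0 and 1; these are adjacent, so the accesses
// may be clustered.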
4307
4308 /// Detect opportunities for ldp/stp formation.
4309 ///
4310 /// Only called for LdSt for which getMemOperandWithOffset returns true.
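///
/// For example, `ldr x1, [x0, #8]` followed by `ldr x2, [x0, #16]` (LDRXui
/// with scaled immediates 1 and 2) share a base register and satisfy
/// Offset1 + 1 == Offset2, so they are reported as clusterable and may later
/// be combined into `ldp x1, x2, [x0, #8]`.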
4311 bool AArch64InstrInfo::shouldClusterMemOps(
4312 ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
4313 bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
4314 int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
4315 unsigned NumBytes) const {
4316 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
4317 const MachineOperand &BaseOp1 = *BaseOps1.front();
4318 const MachineOperand &BaseOp2 = *BaseOps2.front();
4319 const MachineInstr &FirstLdSt = *BaseOp1.getParent();
4320 const MachineInstr &SecondLdSt = *BaseOp2.getParent();
4321 if (BaseOp1.getType() != BaseOp2.getType())
4322 return false;
4323
4324 assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
4325 "Only base registers and frame indices are supported.");
4326
4327 // Check for both base regs and base FI.
4328 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
4329 return false;
4330
4331 // Only cluster up to a single pair.
4332 if (ClusterSize > 2)
4333 return false;
4334
4335 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
4336 return false;
4337
4338 // Can we pair these instructions based on their opcodes?
4339 unsigned FirstOpc = FirstLdSt.getOpcode();
4340 unsigned SecondOpc = SecondLdSt.getOpcode();
4341 if (!canPairLdStOpc(FirstOpc, SecondOpc))
4342 return false;
4343
4344 // Can't merge volatiles or load/stores that have a hint to avoid pair
4345 // formation, for example.
4346 if (!isCandidateToMergeOrPair(FirstLdSt) ||
4347 !isCandidateToMergeOrPair(SecondLdSt))
4348 return false;
4349
4350 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
4351 int64_t Offset1 = FirstLdSt.getOperand(2).getImm();
4352 if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
4353 return false;
4354
4355 int64_t Offset2 = SecondLdSt.getOperand(2).getImm();
4356 if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
4357 return false;
4358
4359 // Pairwise instructions have a 7-bit signed offset field.
4360 if (Offset1 > 63 || Offset1 < -64)
4361 return false;
4362
4363 // The caller should already have ordered First/SecondLdSt by offset.
4364 // Note: except for non-equal frame index bases
4365 if (BaseOp1.isFI()) {
4366 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
4367 "Caller should have ordered offsets.");
4368
4369 const MachineFrameInfo &MFI =
4370 FirstLdSt.getParent()->getParent()->getFrameInfo();
4371 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc,
4372 BaseOp2.getIndex(), Offset2, SecondOpc);
4373 }
4374
4375 assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
4376
4377 return Offset1 + 1 == Offset2;
4378 }
4379
4380 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
4381 unsigned Reg, unsigned SubIdx,
4382 unsigned State,
4383 const TargetRegisterInfo *TRI) {
4384 if (!SubIdx)
4385 return MIB.addReg(Reg, State);
4386
4387 if (Register::isPhysicalRegister(Reg))
4388 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
4389 return MIB.addReg(Reg, State, SubIdx);
4390 }
4391
4392 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
4393 unsigned NumRegs) {
4394   // We really want the positive remainder mod 32 here, which happens to be
4395 // easily obtainable with a mask.
4396 return ((DestReg - SrcReg) & 0x1f) < NumRegs;
4397 }
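// Worked example: with a destination encoding of 0, a source encoding of 1
// and NumRegs == 2, ((0 - 1) & 0x1f) == 31 is not less than 2, so a forward
// sub-register copy is safe; with destination 1 and source 0,
// ((1 - 0) & 0x1f) == 1 < 2, so copyPhysRegTuple below iterates the
// sub-registers in reverse to avoid clobbering the source.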
4398
4399 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
4400 MachineBasicBlock::iterator I,
4401 const DebugLoc &DL, MCRegister DestReg,
4402 MCRegister SrcReg, bool KillSrc,
4403 unsigned Opcode,
4404 ArrayRef<unsigned> Indices) const {
4405 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
4406 const TargetRegisterInfo *TRI = &getRegisterInfo();
4407 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
4408 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
4409 unsigned NumRegs = Indices.size();
4410
4411 int SubReg = 0, End = NumRegs, Incr = 1;
4412 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
4413 SubReg = NumRegs - 1;
4414 End = -1;
4415 Incr = -1;
4416 }
4417
4418 for (; SubReg != End; SubReg += Incr) {
4419 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
4420 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
4421 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
4422 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
4423 }
4424 }
4425
4426 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
4427 MachineBasicBlock::iterator I,
4428 DebugLoc DL, unsigned DestReg,
4429 unsigned SrcReg, bool KillSrc,
4430 unsigned Opcode, unsigned ZeroReg,
4431 llvm::ArrayRef<unsigned> Indices) const {
4432 const TargetRegisterInfo *TRI = &getRegisterInfo();
4433 unsigned NumRegs = Indices.size();
4434
4435 #ifndef NDEBUG
4436 uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
4437 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
4438 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
4439 "GPR reg sequences should not be able to overlap");
4440 #endif
4441
4442 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
4443 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
4444 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
4445 MIB.addReg(ZeroReg);
4446 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
4447 MIB.addImm(0);
4448 }
4449 }
4450
4451 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
4452 MachineBasicBlock::iterator I,
4453 const DebugLoc &DL, MCRegister DestReg,
4454 MCRegister SrcReg, bool KillSrc) const {
4455 if (AArch64::GPR32spRegClass.contains(DestReg) &&
4456 (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
4457 const TargetRegisterInfo *TRI = &getRegisterInfo();
4458
4459 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
4460 // If either operand is WSP, expand to ADD #0.
4461 if (Subtarget.hasZeroCycleRegMove()) {
4462 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
4463 MCRegister DestRegX = TRI->getMatchingSuperReg(
4464 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4465 MCRegister SrcRegX = TRI->getMatchingSuperReg(
4466 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4467 // This instruction is reading and writing X registers. This may upset
4468 // the register scavenger and machine verifier, so we need to indicate
4469 // that we are reading an undefined value from SrcRegX, but a proper
4470 // value from SrcReg.
4471 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
4472 .addReg(SrcRegX, RegState::Undef)
4473 .addImm(0)
4474 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
4475 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
4476 } else {
4477 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
4478 .addReg(SrcReg, getKillRegState(KillSrc))
4479 .addImm(0)
4480 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4481 }
4482 } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
4483 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
4484 .addImm(0)
4485 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4486 } else {
4487 if (Subtarget.hasZeroCycleRegMove()) {
4488 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
4489 MCRegister DestRegX = TRI->getMatchingSuperReg(
4490 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4491 MCRegister SrcRegX = TRI->getMatchingSuperReg(
4492 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4493 // This instruction is reading and writing X registers. This may upset
4494 // the register scavenger and machine verifier, so we need to indicate
4495 // that we are reading an undefined value from SrcRegX, but a proper
4496 // value from SrcReg.
4497 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
4498 .addReg(AArch64::XZR)
4499 .addReg(SrcRegX, RegState::Undef)
4500 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
4501 } else {
4502 // Otherwise, expand to ORR WZR.
4503 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
4504 .addReg(AArch64::WZR)
4505 .addReg(SrcReg, getKillRegState(KillSrc));
4506 }
4507 }
4508 return;
4509 }
4510
4511 // Copy a Predicate register by ORRing with itself.
4512 if (AArch64::PPRRegClass.contains(DestReg) &&
4513 AArch64::PPRRegClass.contains(SrcReg)) {
4514 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4515 "Unexpected SVE register.");
4516 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
4517 .addReg(SrcReg) // Pg
4518 .addReg(SrcReg)
4519 .addReg(SrcReg, getKillRegState(KillSrc));
4520 return;
4521 }
4522
4523 // Copy a predicate-as-counter register by ORRing with itself as if it
4524 // were a regular predicate (mask) register.
4525 bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
4526 bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
4527 if (DestIsPNR || SrcIsPNR) {
4528 auto ToPPR = [](MCRegister R) -> MCRegister {
4529 return (R - AArch64::PN0) + AArch64::P0;
4530 };
4531 MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg;
4532 MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg;
4533
4534 if (PPRSrcReg != PPRDestReg) {
4535 auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)
4536 .addReg(PPRSrcReg) // Pg
4537 .addReg(PPRSrcReg)
4538 .addReg(PPRSrcReg, getKillRegState(KillSrc));
4539 if (DestIsPNR)
4540 NewMI.addDef(DestReg, RegState::Implicit);
4541 }
4542 return;
4543 }
4544
4545 // Copy a Z register by ORRing with itself.
4546 if (AArch64::ZPRRegClass.contains(DestReg) &&
4547 AArch64::ZPRRegClass.contains(SrcReg)) {
4548 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4549 "Unexpected SVE register.");
4550 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
4551 .addReg(SrcReg)
4552 .addReg(SrcReg, getKillRegState(KillSrc));
4553 return;
4554 }
4555
4556 // Copy a Z register pair by copying the individual sub-registers.
4557 if ((AArch64::ZPR2RegClass.contains(DestReg) ||
4558 AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
4559 (AArch64::ZPR2RegClass.contains(SrcReg) ||
4560 AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
4561 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4562 "Unexpected SVE register.");
4563 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
4564 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
4565 Indices);
4566 return;
4567 }
4568
4569 // Copy a Z register triple by copying the individual sub-registers.
4570 if (AArch64::ZPR3RegClass.contains(DestReg) &&
4571 AArch64::ZPR3RegClass.contains(SrcReg)) {
4572 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4573 "Unexpected SVE register.");
4574 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
4575 AArch64::zsub2};
4576 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
4577 Indices);
4578 return;
4579 }
4580
4581 // Copy a Z register quad by copying the individual sub-registers.
4582 if ((AArch64::ZPR4RegClass.contains(DestReg) ||
4583 AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
4584 (AArch64::ZPR4RegClass.contains(SrcReg) ||
4585 AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
4586 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4587 "Unexpected SVE register.");
4588 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
4589 AArch64::zsub2, AArch64::zsub3};
4590 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
4591 Indices);
4592 return;
4593 }
4594
4595 if (AArch64::GPR64spRegClass.contains(DestReg) &&
4596 (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
4597 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
4598 // If either operand is SP, expand to ADD #0.
4599 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
4600 .addReg(SrcReg, getKillRegState(KillSrc))
4601 .addImm(0)
4602 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4603 } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
4604 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
4605 .addImm(0)
4606 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4607 } else {
4608 // Otherwise, expand to ORR XZR.
4609 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
4610 .addReg(AArch64::XZR)
4611 .addReg(SrcReg, getKillRegState(KillSrc));
4612 }
4613 return;
4614 }
4615
4616 // Copy a DDDD register quad by copying the individual sub-registers.
4617 if (AArch64::DDDDRegClass.contains(DestReg) &&
4618 AArch64::DDDDRegClass.contains(SrcReg)) {
4619 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
4620 AArch64::dsub2, AArch64::dsub3};
4621 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
4622 Indices);
4623 return;
4624 }
4625
4626 // Copy a DDD register triple by copying the individual sub-registers.
4627 if (AArch64::DDDRegClass.contains(DestReg) &&
4628 AArch64::DDDRegClass.contains(SrcReg)) {
4629 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
4630 AArch64::dsub2};
4631 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
4632 Indices);
4633 return;
4634 }
4635
4636 // Copy a DD register pair by copying the individual sub-registers.
4637 if (AArch64::DDRegClass.contains(DestReg) &&
4638 AArch64::DDRegClass.contains(SrcReg)) {
4639 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
4640 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
4641 Indices);
4642 return;
4643 }
4644
4645 // Copy a QQQQ register quad by copying the individual sub-registers.
4646 if (AArch64::QQQQRegClass.contains(DestReg) &&
4647 AArch64::QQQQRegClass.contains(SrcReg)) {
4648 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
4649 AArch64::qsub2, AArch64::qsub3};
4650 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
4651 Indices);
4652 return;
4653 }
4654
4655 // Copy a QQQ register triple by copying the individual sub-registers.
4656 if (AArch64::QQQRegClass.contains(DestReg) &&
4657 AArch64::QQQRegClass.contains(SrcReg)) {
4658 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
4659 AArch64::qsub2};
4660 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
4661 Indices);
4662 return;
4663 }
4664
4665 // Copy a QQ register pair by copying the individual sub-registers.
4666 if (AArch64::QQRegClass.contains(DestReg) &&
4667 AArch64::QQRegClass.contains(SrcReg)) {
4668 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
4669 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
4670 Indices);
4671 return;
4672 }
4673
4674 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
4675 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
4676 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
4677 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
4678 AArch64::XZR, Indices);
4679 return;
4680 }
4681
4682 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
4683 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
4684 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
4685 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
4686 AArch64::WZR, Indices);
4687 return;
4688 }
4689
4690 if (AArch64::FPR128RegClass.contains(DestReg) &&
4691 AArch64::FPR128RegClass.contains(SrcReg)) {
4692 if (Subtarget.isSVEorStreamingSVEAvailable() &&
4693 !Subtarget.isNeonAvailable())
4694 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
4695 .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
4696 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
4697 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
4698 else if (Subtarget.isNeonAvailable())
4699 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
4700 .addReg(SrcReg)
4701 .addReg(SrcReg, getKillRegState(KillSrc));
4702 else {
4703 BuildMI(MBB, I, DL, get(AArch64::STRQpre))
4704 .addReg(AArch64::SP, RegState::Define)
4705 .addReg(SrcReg, getKillRegState(KillSrc))
4706 .addReg(AArch64::SP)
4707 .addImm(-16);
4708 BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
4709 .addReg(AArch64::SP, RegState::Define)
4710 .addReg(DestReg, RegState::Define)
4711 .addReg(AArch64::SP)
4712 .addImm(16);
4713 }
4714 return;
4715 }
4716
4717 if (AArch64::FPR64RegClass.contains(DestReg) &&
4718 AArch64::FPR64RegClass.contains(SrcReg)) {
4719 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
4720 .addReg(SrcReg, getKillRegState(KillSrc));
4721 return;
4722 }
4723
4724 if (AArch64::FPR32RegClass.contains(DestReg) &&
4725 AArch64::FPR32RegClass.contains(SrcReg)) {
4726 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
4727 .addReg(SrcReg, getKillRegState(KillSrc));
4728 return;
4729 }
4730
4731 if (AArch64::FPR16RegClass.contains(DestReg) &&
4732 AArch64::FPR16RegClass.contains(SrcReg)) {
4733 DestReg =
4734 RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
4735 SrcReg =
4736 RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
4737 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
4738 .addReg(SrcReg, getKillRegState(KillSrc));
4739 return;
4740 }
4741
4742 if (AArch64::FPR8RegClass.contains(DestReg) &&
4743 AArch64::FPR8RegClass.contains(SrcReg)) {
4744 DestReg =
4745 RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
4746 SrcReg =
4747 RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
4748 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
4749 .addReg(SrcReg, getKillRegState(KillSrc));
4750 return;
4751 }
4752
4753 // Copies between GPR64 and FPR64.
4754 if (AArch64::FPR64RegClass.contains(DestReg) &&
4755 AArch64::GPR64RegClass.contains(SrcReg)) {
4756 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
4757 .addReg(SrcReg, getKillRegState(KillSrc));
4758 return;
4759 }
4760 if (AArch64::GPR64RegClass.contains(DestReg) &&
4761 AArch64::FPR64RegClass.contains(SrcReg)) {
4762 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
4763 .addReg(SrcReg, getKillRegState(KillSrc));
4764 return;
4765 }
4766 // Copies between GPR32 and FPR32.
4767 if (AArch64::FPR32RegClass.contains(DestReg) &&
4768 AArch64::GPR32RegClass.contains(SrcReg)) {
4769 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
4770 .addReg(SrcReg, getKillRegState(KillSrc));
4771 return;
4772 }
4773 if (AArch64::GPR32RegClass.contains(DestReg) &&
4774 AArch64::FPR32RegClass.contains(SrcReg)) {
4775 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
4776 .addReg(SrcReg, getKillRegState(KillSrc));
4777 return;
4778 }
4779
4780 if (DestReg == AArch64::NZCV) {
4781 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
4782 BuildMI(MBB, I, DL, get(AArch64::MSR))
4783 .addImm(AArch64SysReg::NZCV)
4784 .addReg(SrcReg, getKillRegState(KillSrc))
4785 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
4786 return;
4787 }
4788
4789 if (SrcReg == AArch64::NZCV) {
4790 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
4791 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
4792 .addImm(AArch64SysReg::NZCV)
4793 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
4794 return;
4795 }
4796
4797 #ifndef NDEBUG
4798 const TargetRegisterInfo &TRI = getRegisterInfo();
4799 errs() << TRI.getRegAsmName(DestReg) << " = COPY "
4800 << TRI.getRegAsmName(SrcReg) << "\n";
4801 #endif
4802 llvm_unreachable("unimplemented reg-to-reg copy");
4803 }
4804
4805 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
4806 MachineBasicBlock &MBB,
4807 MachineBasicBlock::iterator InsertBefore,
4808 const MCInstrDesc &MCID,
4809 Register SrcReg, bool IsKill,
4810 unsigned SubIdx0, unsigned SubIdx1, int FI,
4811 MachineMemOperand *MMO) {
4812 Register SrcReg0 = SrcReg;
4813 Register SrcReg1 = SrcReg;
4814 if (SrcReg.isPhysical()) {
4815 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
4816 SubIdx0 = 0;
4817 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1);
4818 SubIdx1 = 0;
4819 }
4820 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
4821 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0)
4822 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1)
4823 .addFrameIndex(FI)
4824 .addImm(0)
4825 .addMemOperand(MMO);
4826 }
4827
4828 void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
4829 MachineBasicBlock::iterator MBBI,
4830 Register SrcReg, bool isKill, int FI,
4831 const TargetRegisterClass *RC,
4832 const TargetRegisterInfo *TRI,
4833 Register VReg) const {
4834 MachineFunction &MF = *MBB.getParent();
4835 MachineFrameInfo &MFI = MF.getFrameInfo();
4836
4837 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
4838 MachineMemOperand *MMO =
4839 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
4840 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
4841 unsigned Opc = 0;
4842 bool Offset = true;
4843 MCRegister PNRReg = MCRegister::NoRegister;
4844 unsigned StackID = TargetStackID::Default;
4845 switch (TRI->getSpillSize(*RC)) {
4846 case 1:
4847 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
4848 Opc = AArch64::STRBui;
4849 break;
4850 case 2: {
4851 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
4852 Opc = AArch64::STRHui;
4853 else if (AArch64::PNRRegClass.hasSubClassEq(RC) ||
4854 AArch64::PPRRegClass.hasSubClassEq(RC)) {
4855 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4856 "Unexpected register store without SVE store instructions");
4857 Opc = AArch64::STR_PXI;
4858 StackID = TargetStackID::ScalableVector;
4859 }
4860 break;
4861 }
4862 case 4:
4863 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
4864 Opc = AArch64::STRWui;
4865 if (SrcReg.isVirtual())
4866 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
4867 else
4868 assert(SrcReg != AArch64::WSP);
4869 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
4870 Opc = AArch64::STRSui;
4871 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
4872 Opc = AArch64::STR_PPXI;
4873 StackID = TargetStackID::ScalableVector;
4874 }
4875 break;
4876 case 8:
4877 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
4878 Opc = AArch64::STRXui;
4879 if (SrcReg.isVirtual())
4880 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
4881 else
4882 assert(SrcReg != AArch64::SP);
4883 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
4884 Opc = AArch64::STRDui;
4885 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
4886 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
4887 get(AArch64::STPWi), SrcReg, isKill,
4888 AArch64::sube32, AArch64::subo32, FI, MMO);
4889 return;
4890 }
4891 break;
4892 case 16:
4893 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
4894 Opc = AArch64::STRQui;
4895 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
4896 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4897 Opc = AArch64::ST1Twov1d;
4898 Offset = false;
4899 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
4900 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
4901 get(AArch64::STPXi), SrcReg, isKill,
4902 AArch64::sube64, AArch64::subo64, FI, MMO);
4903 return;
4904 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
4905 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4906 "Unexpected register store without SVE store instructions");
4907 Opc = AArch64::STR_ZXI;
4908 StackID = TargetStackID::ScalableVector;
4909 }
4910 break;
4911 case 24:
4912 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
4913 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4914 Opc = AArch64::ST1Threev1d;
4915 Offset = false;
4916 }
4917 break;
4918 case 32:
4919 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
4920 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4921 Opc = AArch64::ST1Fourv1d;
4922 Offset = false;
4923 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
4924 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4925 Opc = AArch64::ST1Twov2d;
4926 Offset = false;
4927 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
4928 AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
4929 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4930 "Unexpected register store without SVE store instructions");
4931 Opc = AArch64::STR_ZZXI;
4932 StackID = TargetStackID::ScalableVector;
4933 }
4934 break;
4935 case 48:
4936 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
4937 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4938 Opc = AArch64::ST1Threev2d;
4939 Offset = false;
4940 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
4941 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4942 "Unexpected register store without SVE store instructions");
4943 Opc = AArch64::STR_ZZZXI;
4944 StackID = TargetStackID::ScalableVector;
4945 }
4946 break;
4947 case 64:
4948 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
4949 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4950 Opc = AArch64::ST1Fourv2d;
4951 Offset = false;
4952 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
4953 AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
4954 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
4955 "Unexpected register store without SVE store instructions");
4956 Opc = AArch64::STR_ZZZZXI;
4957 StackID = TargetStackID::ScalableVector;
4958 }
4959 break;
4960 }
4961 assert(Opc && "Unknown register class");
4962 MFI.setStackID(FI, StackID);
4963
4964 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
4965 .addReg(SrcReg, getKillRegState(isKill))
4966 .addFrameIndex(FI);
4967
4968 if (Offset)
4969 MI.addImm(0);
4970 if (PNRReg.isValid())
4971 MI.addDef(PNRReg, RegState::Implicit);
4972 MI.addMemOperand(MMO);
4973 }
4974
4975 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
4976 MachineBasicBlock &MBB,
4977 MachineBasicBlock::iterator InsertBefore,
4978 const MCInstrDesc &MCID,
4979 Register DestReg, unsigned SubIdx0,
4980 unsigned SubIdx1, int FI,
4981 MachineMemOperand *MMO) {
4982 Register DestReg0 = DestReg;
4983 Register DestReg1 = DestReg;
4984 bool IsUndef = true;
4985 if (DestReg.isPhysical()) {
4986 DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
4987 SubIdx0 = 0;
4988 DestReg1 = TRI.getSubReg(DestReg, SubIdx1);
4989 SubIdx1 = 0;
4990 IsUndef = false;
4991 }
4992 BuildMI(MBB, InsertBefore, DebugLoc(), MCID)
4993 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0)
4994 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1)
4995 .addFrameIndex(FI)
4996 .addImm(0)
4997 .addMemOperand(MMO);
4998 }
4999
5000 void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
5001 MachineBasicBlock::iterator MBBI,
5002 Register DestReg, int FI,
5003 const TargetRegisterClass *RC,
5004 const TargetRegisterInfo *TRI,
5005 Register VReg) const {
5006 MachineFunction &MF = *MBB.getParent();
5007 MachineFrameInfo &MFI = MF.getFrameInfo();
5008 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
5009 MachineMemOperand *MMO =
5010 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
5011 MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
5012
5013 unsigned Opc = 0;
5014 bool Offset = true;
5015 unsigned StackID = TargetStackID::Default;
5016 Register PNRReg = MCRegister::NoRegister;
5017 switch (TRI->getSpillSize(*RC)) {
5018 case 1:
5019 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
5020 Opc = AArch64::LDRBui;
5021 break;
5022 case 2: {
5023 bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
5024 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
5025 Opc = AArch64::LDRHui;
5026 else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
5027 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5028 "Unexpected register load without SVE load instructions");
5029 if (IsPNR)
5030 PNRReg = DestReg;
5031 Opc = AArch64::LDR_PXI;
5032 StackID = TargetStackID::ScalableVector;
5033 }
5034 break;
5035 }
5036 case 4:
5037 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
5038 Opc = AArch64::LDRWui;
5039 if (DestReg.isVirtual())
5040 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
5041 else
5042 assert(DestReg != AArch64::WSP);
5043 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
5044 Opc = AArch64::LDRSui;
5045 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
5046 Opc = AArch64::LDR_PPXI;
5047 StackID = TargetStackID::ScalableVector;
5048 }
5049 break;
5050 case 8:
5051 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
5052 Opc = AArch64::LDRXui;
5053 if (DestReg.isVirtual())
5054 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
5055 else
5056 assert(DestReg != AArch64::SP);
5057 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
5058 Opc = AArch64::LDRDui;
5059 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
5060 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
5061 get(AArch64::LDPWi), DestReg, AArch64::sube32,
5062 AArch64::subo32, FI, MMO);
5063 return;
5064 }
5065 break;
5066 case 16:
5067 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
5068 Opc = AArch64::LDRQui;
5069 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
5070 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5071 Opc = AArch64::LD1Twov1d;
5072 Offset = false;
5073 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
5074 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
5075 get(AArch64::LDPXi), DestReg, AArch64::sube64,
5076 AArch64::subo64, FI, MMO);
5077 return;
5078 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
5079 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5080 "Unexpected register load without SVE load instructions");
5081 Opc = AArch64::LDR_ZXI;
5082 StackID = TargetStackID::ScalableVector;
5083 }
5084 break;
5085 case 24:
5086 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
5087 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5088 Opc = AArch64::LD1Threev1d;
5089 Offset = false;
5090 }
5091 break;
5092 case 32:
5093 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
5094 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5095 Opc = AArch64::LD1Fourv1d;
5096 Offset = false;
5097 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
5098 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5099 Opc = AArch64::LD1Twov2d;
5100 Offset = false;
5101 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
5102 AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5103 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5104 "Unexpected register load without SVE load instructions");
5105 Opc = AArch64::LDR_ZZXI;
5106 StackID = TargetStackID::ScalableVector;
5107 }
5108 break;
5109 case 48:
5110 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
5111 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5112 Opc = AArch64::LD1Threev2d;
5113 Offset = false;
5114 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
5115 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5116 "Unexpected register load without SVE load instructions");
5117 Opc = AArch64::LDR_ZZZXI;
5118 StackID = TargetStackID::ScalableVector;
5119 }
5120 break;
5121 case 64:
5122 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
5123 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5124 Opc = AArch64::LD1Fourv2d;
5125 Offset = false;
5126 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
5127 AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5128 assert(Subtarget.isSVEorStreamingSVEAvailable() &&
5129 "Unexpected register load without SVE load instructions");
5130 Opc = AArch64::LDR_ZZZZXI;
5131 StackID = TargetStackID::ScalableVector;
5132 }
5133 break;
5134 }
5135
5136 assert(Opc && "Unknown register class");
5137 MFI.setStackID(FI, StackID);
5138
5139 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
5140 .addReg(DestReg, getDefRegState(true))
5141 .addFrameIndex(FI);
5142 if (Offset)
5143 MI.addImm(0);
5144 if (PNRReg.isValid() && !PNRReg.isVirtual())
5145 MI.addDef(PNRReg, RegState::Implicit);
5146 MI.addMemOperand(MMO);
5147 }
5148
5149 bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
5150 const MachineInstr &UseMI,
5151 const TargetRegisterInfo *TRI) {
5152 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
5153 UseMI.getIterator()),
5154 [TRI](const MachineInstr &I) {
5155 return I.modifiesRegister(AArch64::NZCV, TRI) ||
5156 I.readsRegister(AArch64::NZCV, TRI);
5157 });
5158 }
5159
5160 void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
5161 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
5162   // The smallest scalable elements supported by scaled SVE addressing
5163   // modes are predicates, which are 2 scalable bytes in size, so the
5164   // scalable byte offset must always be a multiple of 2.
5165 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
5166
5167   // VGSized offsets are divided by '2', because the VG register is the
5168   // number of 64-bit granules as opposed to 128-bit vector chunks,
5169 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
5170 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
5171 // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
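  // For example, StackOffset::get(/*Fixed=*/16, /*Scalable=*/32) decomposes
  // into ByteSized = 16 and VGSized = 16, i.e. a run-time offset of
  // 16 + 16 * VG bytes.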
5172 ByteSized = Offset.getFixed();
5173 VGSized = Offset.getScalable() / 2;
5174 }
5175
5176 /// Returns the parts into which this frame offset can be decomposed for
5177 /// the purpose of describing a frame offset.
5178 /// For non-scalable offsets this is simply its byte size.
5179 void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
5180 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
5181 int64_t &NumDataVectors) {
5182   // The smallest scalable elements supported by scaled SVE addressing
5183   // modes are predicates, which are 2 scalable bytes in size, so the
5184   // scalable byte offset must always be a multiple of 2.
5185 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
5186
5187 NumBytes = Offset.getFixed();
5188 NumDataVectors = 0;
5189 NumPredicateVectors = Offset.getScalable() / 2;
5190 // This method is used to get the offsets to adjust the frame offset.
5191 // If the function requires ADDPL to be used and needs more than two ADDPL
5192 // instructions, part of the offset is folded into NumDataVectors so that it
5193 // uses ADDVL for part of it, reducing the number of ADDPL instructions.
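  // For example, a scalable offset of 80 bytes gives NumPredicateVectors = 40;
  // 40 is a multiple of 8, so it is folded entirely into NumDataVectors = 5
  // and the adjustment can be done with a single ADDVL #5.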
5194 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
5195 NumPredicateVectors > 62) {
5196 NumDataVectors = NumPredicateVectors / 8;
5197 NumPredicateVectors -= NumDataVectors * 8;
5198 }
5199 }
5200
5201 // Convenience function to create a DWARF expression for
5202 // Expr + NumBytes + NumVGScaledBytes * AArch64::VG
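// For example, with NumBytes = 16 and NumVGScaledBytes = 8 the appended bytes
// are DW_OP_consts 16, DW_OP_plus, DW_OP_consts 8, DW_OP_bregx VG 0,
// DW_OP_mul, DW_OP_plus, and the comment string becomes " + 16 + 8 * VG".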
5203 static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes,
5204 int NumVGScaledBytes, unsigned VG,
5205 llvm::raw_string_ostream &Comment) {
5206 uint8_t buffer[16];
5207
5208 if (NumBytes) {
5209 Expr.push_back(dwarf::DW_OP_consts);
5210 Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
5211 Expr.push_back((uint8_t)dwarf::DW_OP_plus);
5212 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
5213 }
5214
5215 if (NumVGScaledBytes) {
5216 Expr.push_back((uint8_t)dwarf::DW_OP_consts);
5217 Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));
5218
5219 Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
5220 Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
5221 Expr.push_back(0);
5222
5223 Expr.push_back((uint8_t)dwarf::DW_OP_mul);
5224 Expr.push_back((uint8_t)dwarf::DW_OP_plus);
5225
5226 Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
5227 << std::abs(NumVGScaledBytes) << " * VG";
5228 }
5229 }
5230
5231 // Creates an MCCFIInstruction:
5232 // { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
5233 static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
5234 unsigned Reg,
5235 const StackOffset &Offset) {
5236 int64_t NumBytes, NumVGScaledBytes;
5237 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes,
5238 NumVGScaledBytes);
5239 std::string CommentBuffer;
5240 llvm::raw_string_ostream Comment(CommentBuffer);
5241
5242 if (Reg == AArch64::SP)
5243 Comment << "sp";
5244 else if (Reg == AArch64::FP)
5245 Comment << "fp";
5246 else
5247 Comment << printReg(Reg, &TRI);
5248
5249 // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG)
5250 SmallString<64> Expr;
5251 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5252 Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg));
5253 Expr.push_back(0);
5254 appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
5255 TRI.getDwarfRegNum(AArch64::VG, true), Comment);
5256
5257 // Wrap this into DW_CFA_def_cfa.
5258 SmallString<64> DefCfaExpr;
5259 DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression);
5260 uint8_t buffer[16];
5261 DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer));
5262 DefCfaExpr.append(Expr.str());
5263 return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(),
5264 Comment.str());
5265 }
5266
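// For a purely fixed offset this lowers to a plain DW_CFA_def_cfa_offset or
// DW_CFA_def_cfa; any scalable component requires the
// DW_CFA_def_cfa_expression form built by createDefCFAExpression above,
// because the offset then depends on the run-time value of VG.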
5267 MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI,
5268 unsigned FrameReg, unsigned Reg,
5269 const StackOffset &Offset,
5270 bool LastAdjustmentWasScalable) {
5271 if (Offset.getScalable())
5272 return createDefCFAExpression(TRI, Reg, Offset);
5273
5274 if (FrameReg == Reg && !LastAdjustmentWasScalable)
5275 return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed()));
5276
5277 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5278 return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
5279 }
5280
5281 MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI,
5282 unsigned Reg,
5283 const StackOffset &OffsetFromDefCFA) {
5284 int64_t NumBytes, NumVGScaledBytes;
5285 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
5286 OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
5287
5288 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true);
5289
5290 // Non-scalable offsets can use DW_CFA_offset directly.
5291 if (!NumVGScaledBytes)
5292 return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes);
5293
5294 std::string CommentBuffer;
5295 llvm::raw_string_ostream Comment(CommentBuffer);
5296 Comment << printReg(Reg, &TRI) << " @ cfa";
5297
5298 // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
5299 SmallString<64> OffsetExpr;
5300 appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
5301 TRI.getDwarfRegNum(AArch64::VG, true), Comment);
5302
5303 // Wrap this into DW_CFA_expression
5304 SmallString<64> CfaExpr;
5305 CfaExpr.push_back(dwarf::DW_CFA_expression);
5306 uint8_t buffer[16];
5307 CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer));
5308 CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer));
5309 CfaExpr.append(OffsetExpr.str());
5310
5311 return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(),
5312 Comment.str());
5313 }
5314
5315 // Helper function to emit a frame offset adjustment from a given
5316 // pointer (SrcReg), stored into DestReg. This function is explicit
5317 // in that it requires the opcode.
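// For example, an ADDXri adjustment of 5000 bytes is split into
//   add dst, src, #1, lsl #12   // 4096 bytes
//   add dst, dst, #904
// because the 12-bit immediate can only be shifted left by 0 or 12.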
5318 static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
5319 MachineBasicBlock::iterator MBBI,
5320 const DebugLoc &DL, unsigned DestReg,
5321 unsigned SrcReg, int64_t Offset, unsigned Opc,
5322 const TargetInstrInfo *TII,
5323 MachineInstr::MIFlag Flag, bool NeedsWinCFI,
5324 bool *HasWinCFI, bool EmitCFAOffset,
5325 StackOffset CFAOffset, unsigned FrameReg) {
5326 int Sign = 1;
5327 unsigned MaxEncoding, ShiftSize;
5328 switch (Opc) {
5329 case AArch64::ADDXri:
5330 case AArch64::ADDSXri:
5331 case AArch64::SUBXri:
5332 case AArch64::SUBSXri:
5333 MaxEncoding = 0xfff;
5334 ShiftSize = 12;
5335 break;
5336 case AArch64::ADDVL_XXI:
5337 case AArch64::ADDPL_XXI:
5338 case AArch64::ADDSVL_XXI:
5339 case AArch64::ADDSPL_XXI:
5340 MaxEncoding = 31;
5341 ShiftSize = 0;
5342 if (Offset < 0) {
5343 MaxEncoding = 32;
5344 Sign = -1;
5345 Offset = -Offset;
5346 }
5347 break;
5348 default:
5349 llvm_unreachable("Unsupported opcode");
5350 }
5351
5352 // `Offset` can be in bytes or in "scalable bytes".
5353 int VScale = 1;
5354 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
5355 VScale = 16;
5356 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
5357 VScale = 2;
5358
5359 // FIXME: If the offset won't fit in 24-bits, compute the offset into a
5360 // scratch register. If DestReg is a virtual register, use it as the
5361 // scratch register; otherwise, create a new virtual register (to be
5362 // replaced by the scavenger at the end of PEI). That case can be optimized
5363 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
5364 // register can be loaded with offset%8 and the add/sub can use an extending
5365 // instruction with LSL#3.
5366 // Currently the function handles any offsets but generates a poor sequence
5367 // of code.
5368 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
5369
5370 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
5371 Register TmpReg = DestReg;
5372 if (TmpReg == AArch64::XZR)
5373 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
5374 &AArch64::GPR64RegClass);
5375 do {
5376 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
5377 unsigned LocalShiftSize = 0;
5378 if (ThisVal > MaxEncoding) {
5379 ThisVal = ThisVal >> ShiftSize;
5380 LocalShiftSize = ShiftSize;
5381 }
5382 assert((ThisVal >> ShiftSize) <= MaxEncoding &&
5383 "Encoding cannot handle value that big");
5384
5385 Offset -= ThisVal << LocalShiftSize;
5386 if (Offset == 0)
5387 TmpReg = DestReg;
5388 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
5389 .addReg(SrcReg)
5390 .addImm(Sign * (int)ThisVal);
5391 if (ShiftSize)
5392 MBI = MBI.addImm(
5393 AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize));
5394 MBI = MBI.setMIFlag(Flag);
5395
5396 auto Change =
5397 VScale == 1
5398 ? StackOffset::getFixed(ThisVal << LocalShiftSize)
5399 : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
5400 if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
5401 CFAOffset += Change;
5402 else
5403 CFAOffset -= Change;
5404 if (EmitCFAOffset && DestReg == TmpReg) {
5405 MachineFunction &MF = *MBB.getParent();
5406 const TargetSubtargetInfo &STI = MF.getSubtarget();
5407 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
5408
5409 unsigned CFIIndex = MF.addFrameInst(
5410 createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
5411 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
5412 .addCFIIndex(CFIIndex)
5413 .setMIFlags(Flag);
5414 }
5415
5416 if (NeedsWinCFI) {
5417 assert(Sign == 1 && "SEH directives should always have a positive sign");
5418 int Imm = (int)(ThisVal << LocalShiftSize);
5419 if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
5420 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
5421 if (HasWinCFI)
5422 *HasWinCFI = true;
5423 if (Imm == 0)
5424 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
5425 else
5426 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
5427 .addImm(Imm)
5428 .setMIFlag(Flag);
5429 assert(Offset == 0 && "Expected remaining offset to be zero to "
5430 "emit a single SEH directive");
5431 } else if (DestReg == AArch64::SP) {
5432 if (HasWinCFI)
5433 *HasWinCFI = true;
5434 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
5435 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
5436 .addImm(Imm)
5437 .setMIFlag(Flag);
5438 }
5439 }
5440
5441 SrcReg = TmpReg;
5442 } while (Offset);
5443 }
5444
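// For example, with DestReg = SrcReg = SP and
// Offset = StackOffset::get(/*Fixed=*/32, /*Scalable=*/16) this emits
//   add   sp, sp, #32
//   addvl sp, sp, #1
// (or addsvl in a function marked "aarch64_pstate_sm_body"): the fixed part
// first, then a single ADDVL for the one data vector.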
5445 void llvm::emitFrameOffset(MachineBasicBlock &MBB,
5446 MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
5447 unsigned DestReg, unsigned SrcReg,
5448 StackOffset Offset, const TargetInstrInfo *TII,
5449 MachineInstr::MIFlag Flag, bool SetNZCV,
5450 bool NeedsWinCFI, bool *HasWinCFI,
5451 bool EmitCFAOffset, StackOffset CFAOffset,
5452 unsigned FrameReg) {
5453 // If a function is marked as arm_locally_streaming, then the runtime value of
5454   // vscale in the prologue/epilogue is different from the runtime value of vscale
5455 // in the function's body. To avoid having to consider multiple vscales,
5456 // we can use `addsvl` to allocate any scalable stack-slots, which under
5457 // most circumstances will be only locals, not callee-save slots.
5458 const Function &F = MBB.getParent()->getFunction();
5459 bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");
5460
5461 int64_t Bytes, NumPredicateVectors, NumDataVectors;
5462 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
5463 Offset, Bytes, NumPredicateVectors, NumDataVectors);
5464
5465 // First emit non-scalable frame offsets, or a simple 'mov'.
5466 if (Bytes || (!Offset && SrcReg != DestReg)) {
5467 assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
5468 "SP increment/decrement not 8-byte aligned");
5469 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
5470 if (Bytes < 0) {
5471 Bytes = -Bytes;
5472 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
5473 }
5474 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
5475 NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
5476 FrameReg);
5477 CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
5478 ? StackOffset::getFixed(-Bytes)
5479 : StackOffset::getFixed(Bytes);
5480 SrcReg = DestReg;
5481 FrameReg = DestReg;
5482 }
5483
5484 assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
5485 "SetNZCV not supported with SVE vectors");
5486 assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
5487 "WinCFI not supported with SVE vectors");
5488
5489 if (NumDataVectors) {
5490 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
5491 UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI,
5492 TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
5493 CFAOffset, FrameReg);
5494 CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
5495 SrcReg = DestReg;
5496 }
5497
5498 if (NumPredicateVectors) {
5499 assert(DestReg != AArch64::SP && "Unaligned access to SP");
5500 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
5501 UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI,
5502 TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
5503 CFAOffset, FrameReg);
5504 }
5505 }
5506
5507 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
5508 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
5509 MachineBasicBlock::iterator InsertPt, int FrameIndex,
5510 LiveIntervals *LIS, VirtRegMap *VRM) const {
5511 // This is a bit of a hack. Consider this instruction:
5512 //
5513 // %0 = COPY %sp; GPR64all:%0
5514 //
5515 // We explicitly chose GPR64all for the virtual register so such a copy might
5516 // be eliminated by RegisterCoalescer. However, that may not be possible, and
5517 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
5518 // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
5519 //
5520 // To prevent that, we are going to constrain the %0 register class here.
5521 if (MI.isFullCopy()) {
5522 Register DstReg = MI.getOperand(0).getReg();
5523 Register SrcReg = MI.getOperand(1).getReg();
5524 if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
5525 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
5526 return nullptr;
5527 }
5528 if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
5529 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
5530 return nullptr;
5531 }
5532     // Nothing can be folded with a copy from/to NZCV.
5533 if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
5534 return nullptr;
5535 }
5536
5537 // Handle the case where a copy is being spilled or filled but the source
5538 // and destination register class don't match. For example:
5539 //
5540 // %0 = COPY %xzr; GPR64common:%0
5541 //
5542 // In this case we can still safely fold away the COPY and generate the
5543 // following spill code:
5544 //
5545 // STRXui %xzr, %stack.0
5546 //
5547 // This also eliminates spilled cross register class COPYs (e.g. between x and
5548 // d regs) of the same size. For example:
5549 //
5550 // %0 = COPY %1; GPR64:%0, FPR64:%1
5551 //
5552 // will be filled as
5553 //
5554 // LDRDui %0, fi<#0>
5555 //
5556 // instead of
5557 //
5558 // LDRXui %Temp, fi<#0>
5559 // %0 = FMOV %Temp
5560 //
5561 if (MI.isCopy() && Ops.size() == 1 &&
5562 // Make sure we're only folding the explicit COPY defs/uses.
5563 (Ops[0] == 0 || Ops[0] == 1)) {
5564 bool IsSpill = Ops[0] == 0;
5565 bool IsFill = !IsSpill;
5566 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
5567 const MachineRegisterInfo &MRI = MF.getRegInfo();
5568 MachineBasicBlock &MBB = *MI.getParent();
5569 const MachineOperand &DstMO = MI.getOperand(0);
5570 const MachineOperand &SrcMO = MI.getOperand(1);
5571 Register DstReg = DstMO.getReg();
5572 Register SrcReg = SrcMO.getReg();
5573 // This is slightly expensive to compute for physical regs since
5574 // getMinimalPhysRegClass is slow.
5575 auto getRegClass = [&](unsigned Reg) {
5576 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
5577 : TRI.getMinimalPhysRegClass(Reg);
5578 };
5579
5580 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
5581 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
5582 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
5583 "Mismatched register size in non subreg COPY");
5584 if (IsSpill)
5585 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
5586 getRegClass(SrcReg), &TRI, Register());
5587 else
5588 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex,
5589 getRegClass(DstReg), &TRI, Register());
5590 return &*--InsertPt;
5591 }
5592
5593 // Handle cases like spilling def of:
5594 //
5595 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
5596 //
5597 // where the physical register source can be widened and stored to the full
5598 // virtual reg destination stack slot, in this case producing:
5599 //
5600 // STRXui %xzr, %stack.0
5601 //
5602 if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
5603 TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) {
5604 assert(SrcMO.getSubReg() == 0 &&
5605 "Unexpected subreg on physical register");
5606 storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
5607 FrameIndex, &AArch64::GPR64RegClass, &TRI,
5608 Register());
5609 return &*--InsertPt;
5610 }
5611
5612 // Handle cases like filling use of:
5613 //
5614 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
5615 //
5616 // where we can load the full virtual reg source stack slot, into the subreg
5617 // destination, in this case producing:
5618 //
5619 // LDRWui %0:sub_32<def,read-undef>, %stack.0
5620 //
5621 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
5622 const TargetRegisterClass *FillRC;
5623 switch (DstMO.getSubReg()) {
5624 default:
5625 FillRC = nullptr;
5626 break;
5627 case AArch64::sub_32:
5628 FillRC = &AArch64::GPR32RegClass;
5629 break;
5630 case AArch64::ssub:
5631 FillRC = &AArch64::FPR32RegClass;
5632 break;
5633 case AArch64::dsub:
5634 FillRC = &AArch64::FPR64RegClass;
5635 break;
5636 }
5637
5638 if (FillRC) {
5639 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
5640 TRI.getRegSizeInBits(*FillRC) &&
5641 "Mismatched regclass size on folded subreg COPY");
5642 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI,
5643 Register());
5644 MachineInstr &LoadMI = *--InsertPt;
5645 MachineOperand &LoadDst = LoadMI.getOperand(0);
5646 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
5647 LoadDst.setSubReg(DstMO.getSubReg());
5648 LoadDst.setIsUndef();
5649 return &LoadMI;
5650 }
5651 }
5652 }
5653
5654 // Cannot fold.
5655 return nullptr;
5656 }
5657
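// For example, an LDRXui (scale 8, immediate range [0, 4095]) whose combined
// offset works out to 20 bytes cannot stay scaled since 20 is not a multiple
// of 8; the unscaled LDURXi (scale 1, range [-256, 255]) is chosen instead,
// *OutUseUnscaledOp is set, *EmittableOffset becomes 20, the residual SOffset
// becomes zero, and the result is
// AArch64FrameOffsetCanUpdate | AArch64FrameOffsetIsLegal.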
5658 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
5659 StackOffset &SOffset,
5660 bool *OutUseUnscaledOp,
5661 unsigned *OutUnscaledOp,
5662 int64_t *EmittableOffset) {
5663 // Set output values in case of early exit.
5664 if (EmittableOffset)
5665 *EmittableOffset = 0;
5666 if (OutUseUnscaledOp)
5667 *OutUseUnscaledOp = false;
5668 if (OutUnscaledOp)
5669 *OutUnscaledOp = 0;
5670
5671 // Exit early for structured vector spills/fills as they can't take an
5672 // immediate offset.
5673 switch (MI.getOpcode()) {
5674 default:
5675 break;
5676 case AArch64::LD1Rv1d:
5677 case AArch64::LD1Rv2s:
5678 case AArch64::LD1Rv2d:
5679 case AArch64::LD1Rv4h:
5680 case AArch64::LD1Rv4s:
5681 case AArch64::LD1Rv8b:
5682 case AArch64::LD1Rv8h:
5683 case AArch64::LD1Rv16b:
5684 case AArch64::LD1Twov2d:
5685 case AArch64::LD1Threev2d:
5686 case AArch64::LD1Fourv2d:
5687 case AArch64::LD1Twov1d:
5688 case AArch64::LD1Threev1d:
5689 case AArch64::LD1Fourv1d:
5690 case AArch64::ST1Twov2d:
5691 case AArch64::ST1Threev2d:
5692 case AArch64::ST1Fourv2d:
5693 case AArch64::ST1Twov1d:
5694 case AArch64::ST1Threev1d:
5695 case AArch64::ST1Fourv1d:
5696 case AArch64::ST1i8:
5697 case AArch64::ST1i16:
5698 case AArch64::ST1i32:
5699 case AArch64::ST1i64:
5700 case AArch64::IRG:
5701 case AArch64::IRGstack:
5702 case AArch64::STGloop:
5703 case AArch64::STZGloop:
5704 return AArch64FrameOffsetCannotUpdate;
5705 }
5706
5707 // Get the min/max offset and the scale.
5708 TypeSize ScaleValue(0U, false), Width(0U, false);
5709 int64_t MinOff, MaxOff;
5710 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
5711 MaxOff))
5712 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
5713
5714 // Construct the complete offset.
5715 bool IsMulVL = ScaleValue.isScalable();
5716 unsigned Scale = ScaleValue.getKnownMinValue();
5717 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
5718
5719 const MachineOperand &ImmOpnd =
5720 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
5721 Offset += ImmOpnd.getImm() * Scale;
5722
5723 // If the offset doesn't match the scale, we rewrite the instruction to
5724 // use the unscaled instruction instead. Likewise, if we have a negative
5725 // offset and there is an unscaled op to use.
5726 std::optional<unsigned> UnscaledOp =
5727 AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
5728 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
5729 if (useUnscaledOp &&
5730 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
5731 MaxOff))
5732 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
5733
5734 Scale = ScaleValue.getKnownMinValue();
5735 assert(IsMulVL == ScaleValue.isScalable() &&
5736 "Unscaled opcode has different value for scalable");
5737
5738 int64_t Remainder = Offset % Scale;
5739 assert(!(Remainder && useUnscaledOp) &&
5740 "Cannot have remainder when using unscaled op");
5741
5742 assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
5743 int64_t NewOffset = Offset / Scale;
5744 if (MinOff <= NewOffset && NewOffset <= MaxOff)
5745 Offset = Remainder;
5746 else {
5747 NewOffset = NewOffset < 0 ? MinOff : MaxOff;
5748 Offset = Offset - (NewOffset * Scale);
5749 }
5750
5751 if (EmittableOffset)
5752 *EmittableOffset = NewOffset;
5753 if (OutUseUnscaledOp)
5754 *OutUseUnscaledOp = useUnscaledOp;
5755 if (OutUnscaledOp && UnscaledOp)
5756 *OutUnscaledOp = *UnscaledOp;
5757
5758 if (IsMulVL)
5759 SOffset = StackOffset::get(SOffset.getFixed(), Offset);
5760 else
5761 SOffset = StackOffset::get(Offset, SOffset.getScalable());
5762 return AArch64FrameOffsetCanUpdate |
5763 (SOffset ? 0 : AArch64FrameOffsetIsLegal);
5764 }
5765
5766 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
5767 unsigned FrameReg, StackOffset &Offset,
5768 const AArch64InstrInfo *TII) {
5769 unsigned Opcode = MI.getOpcode();
5770 unsigned ImmIdx = FrameRegIdx + 1;
5771
5772 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
5773 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm());
5774 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
5775 MI.getOperand(0).getReg(), FrameReg, Offset, TII,
5776 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
5777 MI.eraseFromParent();
5778 Offset = StackOffset();
5779 return true;
5780 }
5781
5782 int64_t NewOffset;
5783 unsigned UnscaledOp;
5784 bool UseUnscaledOp;
5785 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
5786 &UnscaledOp, &NewOffset);
5787 if (Status & AArch64FrameOffsetCanUpdate) {
5788 if (Status & AArch64FrameOffsetIsLegal)
5789 // Replace the FrameIndex with FrameReg.
5790 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
5791 if (UseUnscaledOp)
5792 MI.setDesc(TII->get(UnscaledOp));
5793
5794 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
5795 return !Offset;
5796 }
5797
5798 return false;
5799 }
5800
5801 void AArch64InstrInfo::insertNoop(MachineBasicBlock &MBB,
5802 MachineBasicBlock::iterator MI) const {
5803 DebugLoc DL;
5804 BuildMI(MBB, MI, DL, get(AArch64::HINT)).addImm(0);
5805 }
5806
5807 MCInst AArch64InstrInfo::getNop() const {
5808 return MCInstBuilder(AArch64::HINT).addImm(0);
5809 }
5810
5811 // AArch64 supports MachineCombiner.
5812 bool AArch64InstrInfo::useMachineCombiner() const { return true; }
5813
5814 // True when Opc sets the condition flags (NZCV).
5815 static bool isCombineInstrSettingFlag(unsigned Opc) {
5816 switch (Opc) {
5817 case AArch64::ADDSWrr:
5818 case AArch64::ADDSWri:
5819 case AArch64::ADDSXrr:
5820 case AArch64::ADDSXri:
5821 case AArch64::SUBSWrr:
5822 case AArch64::SUBSXrr:
5823 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5824 case AArch64::SUBSWri:
5825 case AArch64::SUBSXri:
5826 return true;
5827 default:
5828 break;
5829 }
5830 return false;
5831 }
5832
5833 // 32b Opcodes that can be combined with a MUL
5834 static bool isCombineInstrCandidate32(unsigned Opc) {
5835 switch (Opc) {
5836 case AArch64::ADDWrr:
5837 case AArch64::ADDWri:
5838 case AArch64::SUBWrr:
5839 case AArch64::ADDSWrr:
5840 case AArch64::ADDSWri:
5841 case AArch64::SUBSWrr:
5842 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5843 case AArch64::SUBWri:
5844 case AArch64::SUBSWri:
5845 return true;
5846 default:
5847 break;
5848 }
5849 return false;
5850 }
5851
5852 // 64b Opcodes that can be combined with a MUL
5853 static bool isCombineInstrCandidate64(unsigned Opc) {
5854 switch (Opc) {
5855 case AArch64::ADDXrr:
5856 case AArch64::ADDXri:
5857 case AArch64::SUBXrr:
5858 case AArch64::ADDSXrr:
5859 case AArch64::ADDSXri:
5860 case AArch64::SUBSXrr:
5861 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5862 case AArch64::SUBXri:
5863 case AArch64::SUBSXri:
5864 case AArch64::ADDv8i8:
5865 case AArch64::ADDv16i8:
5866 case AArch64::ADDv4i16:
5867 case AArch64::ADDv8i16:
5868 case AArch64::ADDv2i32:
5869 case AArch64::ADDv4i32:
5870 case AArch64::SUBv8i8:
5871 case AArch64::SUBv16i8:
5872 case AArch64::SUBv4i16:
5873 case AArch64::SUBv8i16:
5874 case AArch64::SUBv2i32:
5875 case AArch64::SUBv4i32:
5876 return true;
5877 default:
5878 break;
5879 }
5880 return false;
5881 }
5882
5883 // FP Opcodes that can be combined with an FMUL.
5884 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
5885 switch (Inst.getOpcode()) {
5886 default:
5887 break;
5888 case AArch64::FADDHrr:
5889 case AArch64::FADDSrr:
5890 case AArch64::FADDDrr:
5891 case AArch64::FADDv4f16:
5892 case AArch64::FADDv8f16:
5893 case AArch64::FADDv2f32:
5894 case AArch64::FADDv2f64:
5895 case AArch64::FADDv4f32:
5896 case AArch64::FSUBHrr:
5897 case AArch64::FSUBSrr:
5898 case AArch64::FSUBDrr:
5899 case AArch64::FSUBv4f16:
5900 case AArch64::FSUBv8f16:
5901 case AArch64::FSUBv2f32:
5902 case AArch64::FSUBv2f64:
5903 case AArch64::FSUBv4f32:
5904 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
5905     // We can fuse FADD/FSUB with FMUL if fusion is either allowed globally by
5906     // the target options or if FADD/FSUB has the contract fast-math flag.
5907     return Options.UnsafeFPMath ||
5908            Options.AllowFPOpFusion == FPOpFusion::Fast ||
5909            Inst.getFlag(MachineInstr::FmContract);
5911 }
5912 return false;
5913 }
5914
5915 // Opcodes that can be combined with a MUL
5916 static bool isCombineInstrCandidate(unsigned Opc) {
5917 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
5918 }
5919
5920 //
5921 // Utility routine that checks if \param MO is defined by an
5922 // \param CombineOpc instruction in the basic block \param MBB
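// For example, with
//   %2:gpr32 = MADDWrrr %0, %1, $wzr    (a plain MUL)
//   %3:gpr32 = ADDWrr %2, %4
// canCombine(MBB, AddMI.getOperand(1), AArch64::MADDWrrr, AArch64::WZR, true)
// returns true, provided %2 has no other non-debug use.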
5923 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
5924 unsigned CombineOpc, unsigned ZeroReg = 0,
5925 bool CheckZeroReg = false) {
5926 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
5927 MachineInstr *MI = nullptr;
5928
5929 if (MO.isReg() && MO.getReg().isVirtual())
5930 MI = MRI.getUniqueVRegDef(MO.getReg());
5931 // And it needs to be in the trace (otherwise, it won't have a depth).
5932 if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
5933 return false;
5934   // Must only be used by the user we combine with.
5935 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
5936 return false;
5937
5938 if (CheckZeroReg) {
5939 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
5940 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
5941            MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
5942 // The third input reg must be zero.
5943 if (MI->getOperand(3).getReg() != ZeroReg)
5944 return false;
5945 }
5946
5947 if (isCombineInstrSettingFlag(CombineOpc) &&
5948 MI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) == -1)
5949 return false;
5950
5951 return true;
5952 }
5953
5954 //
5955 // Is \param MO defined by an integer multiply and can it be combined?
5956 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
5957 unsigned MulOpc, unsigned ZeroReg) {
5958 return canCombine(MBB, MO, MulOpc, ZeroReg, true);
5959 }
5960
5961 //
5962 // Is \param MO defined by a floating-point multiply and can it be combined?
5963 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
5964 unsigned MulOpc) {
5965 return canCombine(MBB, MO, MulOpc);
5966 }
5967
5968 // TODO: There are many more machine instruction opcodes to match:
5969 // 1. Other data types (integer, vectors)
5970 // 2. Other math / logic operations (xor, or)
5971 // 3. Other forms of the same operation (intrinsics and other variants)
5972 bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
5973 bool Invert) const {
5974 if (Invert)
5975 return false;
5976 switch (Inst.getOpcode()) {
5977 // == Floating-point types ==
5978 // -- Floating-point instructions --
5979 case AArch64::FADDHrr:
5980 case AArch64::FADDSrr:
5981 case AArch64::FADDDrr:
5982 case AArch64::FMULHrr:
5983 case AArch64::FMULSrr:
5984 case AArch64::FMULDrr:
5985 case AArch64::FMULX16:
5986 case AArch64::FMULX32:
5987 case AArch64::FMULX64:
5988 // -- Advanced SIMD instructions --
5989 case AArch64::FADDv4f16:
5990 case AArch64::FADDv8f16:
5991 case AArch64::FADDv2f32:
5992 case AArch64::FADDv4f32:
5993 case AArch64::FADDv2f64:
5994 case AArch64::FMULv4f16:
5995 case AArch64::FMULv8f16:
5996 case AArch64::FMULv2f32:
5997 case AArch64::FMULv4f32:
5998 case AArch64::FMULv2f64:
5999 case AArch64::FMULXv4f16:
6000 case AArch64::FMULXv8f16:
6001 case AArch64::FMULXv2f32:
6002 case AArch64::FMULXv4f32:
6003 case AArch64::FMULXv2f64:
6004 // -- SVE instructions --
6005 // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
6006 // in the SVE instruction set (though there are predicated ones).
6007 case AArch64::FADD_ZZZ_H:
6008 case AArch64::FADD_ZZZ_S:
6009 case AArch64::FADD_ZZZ_D:
6010 case AArch64::FMUL_ZZZ_H:
6011 case AArch64::FMUL_ZZZ_S:
6012 case AArch64::FMUL_ZZZ_D:
6013 return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath ||
6014 (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
6015 Inst.getFlag(MachineInstr::MIFlag::FmNsz));
6016
6017 // == Integer types ==
6018 // -- Base instructions --
6019 // Opcodes MULWrr and MULXrr don't exist because
6020 // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
6021 // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
6022   // The machine-combiner does not support three-source-operand machine
6023   // instructions, so we cannot reassociate MULs.
6024 case AArch64::ADDWrr:
6025 case AArch64::ADDXrr:
6026 case AArch64::ANDWrr:
6027 case AArch64::ANDXrr:
6028 case AArch64::ORRWrr:
6029 case AArch64::ORRXrr:
6030 case AArch64::EORWrr:
6031 case AArch64::EORXrr:
6032 case AArch64::EONWrr:
6033 case AArch64::EONXrr:
6034 // -- Advanced SIMD instructions --
6035 // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
6036 // in the Advanced SIMD instruction set.
6037 case AArch64::ADDv8i8:
6038 case AArch64::ADDv16i8:
6039 case AArch64::ADDv4i16:
6040 case AArch64::ADDv8i16:
6041 case AArch64::ADDv2i32:
6042 case AArch64::ADDv4i32:
6043 case AArch64::ADDv1i64:
6044 case AArch64::ADDv2i64:
6045 case AArch64::MULv8i8:
6046 case AArch64::MULv16i8:
6047 case AArch64::MULv4i16:
6048 case AArch64::MULv8i16:
6049 case AArch64::MULv2i32:
6050 case AArch64::MULv4i32:
6051 case AArch64::ANDv8i8:
6052 case AArch64::ANDv16i8:
6053 case AArch64::ORRv8i8:
6054 case AArch64::ORRv16i8:
6055 case AArch64::EORv8i8:
6056 case AArch64::EORv16i8:
6057 // -- SVE instructions --
6058 case AArch64::ADD_ZZZ_B:
6059 case AArch64::ADD_ZZZ_H:
6060 case AArch64::ADD_ZZZ_S:
6061 case AArch64::ADD_ZZZ_D:
6062 case AArch64::MUL_ZZZ_B:
6063 case AArch64::MUL_ZZZ_H:
6064 case AArch64::MUL_ZZZ_S:
6065 case AArch64::MUL_ZZZ_D:
6066 case AArch64::AND_ZZZ:
6067 case AArch64::ORR_ZZZ:
6068 case AArch64::EOR_ZZZ:
6069 return true;
6070
6071 default:
6072 return false;
6073 }
6074 }
6075
6076 /// Find instructions that can be turned into madd.
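/// For example, given
///   %2:gpr32 = MADDWrrr %0, %1, $wzr    (a plain MUL)
///   %4:gpr32 = ADDWrr %3, %2
/// this records MULADDW_OP2, which the machine combiner can later rewrite to
///   %4:gpr32 = MADDWrrr %0, %1, %3.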
6077 static bool getMaddPatterns(MachineInstr &Root,
6078 SmallVectorImpl<unsigned> &Patterns) {
6079 unsigned Opc = Root.getOpcode();
6080 MachineBasicBlock &MBB = *Root.getParent();
6081 bool Found = false;
6082
6083 if (!isCombineInstrCandidate(Opc))
6084 return false;
6085 if (isCombineInstrSettingFlag(Opc)) {
6086 int Cmp_NZCV =
6087 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
6088     // When NZCV is live, bail out.
6089 if (Cmp_NZCV == -1)
6090 return false;
6091 unsigned NewOpc = convertToNonFlagSettingOpc(Root);
6092     // When the opcode can't change, bail out.
6093 // CHECKME: do we miss any cases for opcode conversion?
6094 if (NewOpc == Opc)
6095 return false;
6096 Opc = NewOpc;
6097 }
6098
6099 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
6100 unsigned Pattern) {
6101 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) {
6102 Patterns.push_back(Pattern);
6103 Found = true;
6104 }
6105 };
6106
6107 auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
6108 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
6109 Patterns.push_back(Pattern);
6110 Found = true;
6111 }
6112 };
6113
6114 typedef AArch64MachineCombinerPattern MCP;
6115
6116 switch (Opc) {
6117 default:
6118 break;
6119 case AArch64::ADDWrr:
6120 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6121 "ADDWrr does not have register operands");
6122 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
6123 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
6124 break;
6125 case AArch64::ADDXrr:
6126 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
6127 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
6128 break;
6129 case AArch64::SUBWrr:
6130 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
6131 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
6132 break;
6133 case AArch64::SUBXrr:
6134 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
6135 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
6136 break;
6137 case AArch64::ADDWri:
6138 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
6139 break;
6140 case AArch64::ADDXri:
6141 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
6142 break;
6143 case AArch64::SUBWri:
6144 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
6145 break;
6146 case AArch64::SUBXri:
6147 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
6148 break;
6149 case AArch64::ADDv8i8:
6150 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
6151 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
6152 break;
6153 case AArch64::ADDv16i8:
6154 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
6155 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
6156 break;
6157 case AArch64::ADDv4i16:
6158 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
6159 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
6160 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
6161 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
6162 break;
6163 case AArch64::ADDv8i16:
6164 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
6165 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
6166 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
6167 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
6168 break;
6169 case AArch64::ADDv2i32:
6170 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
6171 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
6172 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
6173 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
6174 break;
6175 case AArch64::ADDv4i32:
6176 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
6177 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
6178 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
6179 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
6180 break;
6181 case AArch64::SUBv8i8:
6182 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
6183 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
6184 break;
6185 case AArch64::SUBv16i8:
6186 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
6187 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
6188 break;
6189 case AArch64::SUBv4i16:
6190 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
6191 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
6192 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
6193 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
6194 break;
6195 case AArch64::SUBv8i16:
6196 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
6197 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
6198 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
6199 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
6200 break;
6201 case AArch64::SUBv2i32:
6202 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
6203 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
6204 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
6205 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
6206 break;
6207 case AArch64::SUBv4i32:
6208 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
6209 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
6210 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
6211 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
6212 break;
6213 }
6214 return Found;
6215 }
6216 /// Floating-Point Support
6217
6218 /// Find instructions that can be turned into a fused multiply-add (FMA).
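/// For example, an FADDSrr whose first operand is defined by an FMULSrr
/// records FMULADDS_OP1, which the machine combiner can later rewrite to a
/// single FMADDSrrr when fusion is permitted (see isCombineInstrCandidateFP).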
6219 static bool getFMAPatterns(MachineInstr &Root,
6220 SmallVectorImpl<unsigned> &Patterns) {
6221
6222 if (!isCombineInstrCandidateFP(Root))
6223 return false;
6224
6225 MachineBasicBlock &MBB = *Root.getParent();
6226 bool Found = false;
6227
6228 auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
6229 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) {
6230 Patterns.push_back(Pattern);
6231 return true;
6232 }
6233 return false;
6234 };
6235
6236 typedef AArch64MachineCombinerPattern MCP;
6237
6238 switch (Root.getOpcode()) {
6239 default:
6240 assert(false && "Unsupported FP instruction in combiner\n");
6241 break;
6242 case AArch64::FADDHrr:
6243 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6244 "FADDHrr does not have register operands");
6245
6246 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
6247 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
6248 break;
6249 case AArch64::FADDSrr:
6250 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6251 "FADDSrr does not have register operands");
6252
6253 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
6254 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
6255
6256 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
6257 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
6258 break;
6259 case AArch64::FADDDrr:
6260 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
6261 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
6262
6263 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
6264 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
6265 break;
6266 case AArch64::FADDv4f16:
6267 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
6268 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
6269
6270 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
6271 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
6272 break;
6273 case AArch64::FADDv8f16:
6274 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
6275 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
6276
6277 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
6278 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
6279 break;
6280 case AArch64::FADDv2f32:
6281 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
6282 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
6283
6284 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
6285 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
6286 break;
6287 case AArch64::FADDv2f64:
6288 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
6289 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
6290
6291 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
6292 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
6293 break;
6294 case AArch64::FADDv4f32:
6295 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
6296 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
6297
6298 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
6299 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
6300 break;
6301 case AArch64::FSUBHrr:
6302 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
6303 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
6304 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
6305 break;
6306 case AArch64::FSUBSrr:
6307 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
6308
6309 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
6310 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
6311
6312 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
6313 break;
6314 case AArch64::FSUBDrr:
6315 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
6316
6317 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
6318 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
6319
6320 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
6321 break;
6322 case AArch64::FSUBv4f16:
6323 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
6324 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
6325
6326 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
6327 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
6328 break;
6329 case AArch64::FSUBv8f16:
6330 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
6331 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
6332
6333 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
6334 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
6335 break;
6336 case AArch64::FSUBv2f32:
6337 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
6338 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
6339
6340 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
6341 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
6342 break;
6343 case AArch64::FSUBv2f64:
6344 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
6345 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
6346
6347 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
6348 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
6349 break;
6350 case AArch64::FSUBv4f32:
6351 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
6352 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
6353
6354 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
6355 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
6356 break;
6357 }
6358 return Found;
6359 }
6360
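/// Find FMUL instructions whose first or second operand is produced by a DUP
/// of a vector lane; such a multiply can later be rewritten as a lane-indexed
/// FMUL (see genIndexedMultiply). A no-op COPY between the FMUL and the DUP is
/// looked through.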
6361 static bool getFMULPatterns(MachineInstr &Root,
6362 SmallVectorImpl<unsigned> &Patterns) {
6363 MachineBasicBlock &MBB = *Root.getParent();
6364 bool Found = false;
6365
6366 auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
6367 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6368 MachineOperand &MO = Root.getOperand(Operand);
6369 MachineInstr *MI = nullptr;
6370 if (MO.isReg() && MO.getReg().isVirtual())
6371 MI = MRI.getUniqueVRegDef(MO.getReg());
6372 // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
6373 if (MI && MI->getOpcode() == TargetOpcode::COPY &&
6374 MI->getOperand(1).getReg().isVirtual())
6375 MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg());
6376 if (MI && MI->getOpcode() == Opcode) {
6377 Patterns.push_back(Pattern);
6378 return true;
6379 }
6380 return false;
6381 };
6382
6383 typedef AArch64MachineCombinerPattern MCP;
6384
6385 switch (Root.getOpcode()) {
6386 default:
6387 return false;
6388 case AArch64::FMULv2f32:
6389 Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
6390 Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
6391 break;
6392 case AArch64::FMULv2f64:
6393 Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
6394 Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
6395 break;
6396 case AArch64::FMULv4f16:
6397 Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
6398 Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
6399 break;
6400 case AArch64::FMULv4f32:
6401 Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
6402 Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
6403 break;
6404 case AArch64::FMULv8f16:
6405 Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
6406 Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
6407 break;
6408 }
6409
6410 return Found;
6411 }
6412
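/// Match an FNEG whose operand is the single use of an FMADD; when both
/// instructions carry the contract and nsz fast-math flags, the pair can be
/// rewritten as a single FNMADD (see genFNegatedMAD).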
6413 static bool getFNEGPatterns(MachineInstr &Root,
6414 SmallVectorImpl<unsigned> &Patterns) {
6415 unsigned Opc = Root.getOpcode();
6416 MachineBasicBlock &MBB = *Root.getParent();
6417 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6418
6419 auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
6420 MachineOperand &MO = Root.getOperand(1);
6421 MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg());
6422 if (MI != nullptr && (MI->getOpcode() == Opcode) &&
6423 MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) &&
6424 Root.getFlag(MachineInstr::MIFlag::FmContract) &&
6425 Root.getFlag(MachineInstr::MIFlag::FmNsz) &&
6426 MI->getFlag(MachineInstr::MIFlag::FmContract) &&
6427 MI->getFlag(MachineInstr::MIFlag::FmNsz)) {
6428 Patterns.push_back(Pattern);
6429 return true;
6430 }
6431 return false;
6432 };
6433
6434 switch (Opc) {
6435 default:
6436 break;
6437 case AArch64::FNEGDr:
6438 return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
6439 case AArch64::FNEGSr:
6440 return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
6441 }
6442
6443 return false;
6444 }
6445
6446 /// Return true when a code sequence can improve throughput. It
6447 /// should be called only for instructions in loops.
6448 /// \param Pattern - combiner pattern
6449 bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
6450 switch (Pattern) {
6451 default:
6452 break;
6453 case AArch64MachineCombinerPattern::FMULADDH_OP1:
6454 case AArch64MachineCombinerPattern::FMULADDH_OP2:
6455 case AArch64MachineCombinerPattern::FMULSUBH_OP1:
6456 case AArch64MachineCombinerPattern::FMULSUBH_OP2:
6457 case AArch64MachineCombinerPattern::FMULADDS_OP1:
6458 case AArch64MachineCombinerPattern::FMULADDS_OP2:
6459 case AArch64MachineCombinerPattern::FMULSUBS_OP1:
6460 case AArch64MachineCombinerPattern::FMULSUBS_OP2:
6461 case AArch64MachineCombinerPattern::FMULADDD_OP1:
6462 case AArch64MachineCombinerPattern::FMULADDD_OP2:
6463 case AArch64MachineCombinerPattern::FMULSUBD_OP1:
6464 case AArch64MachineCombinerPattern::FMULSUBD_OP2:
6465 case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
6466 case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
6467 case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
6468 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
6469 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
6470 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
6471 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
6472 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
6473 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
6474 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
6475 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
6476 case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
6477 case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
6478 case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
6479 case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
6480 case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
6481 case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
6482 case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
6483 case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
6484 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
6485 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
6486 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
6487 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
6488 case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
6489 case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
6490 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
6491 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
6492 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1:
6493 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
6494 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1:
6495 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
6496 case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
6497 case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
6498 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
6499 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
6500 case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
6501 case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
6502 case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
6503 case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
6504 case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
6505 case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
6506 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
6507 case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
6508 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
6509 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2:
6510 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
6511 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2:
6512 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
6513 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2:
6514 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
6515 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2:
6516 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
6517 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2:
6518 case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
6519 case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
6520 case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
6521 case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
6522 case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
6523 case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
6524 case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
6525 case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
6526 case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
6527 case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
6528 case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
6529 case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
6530 case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
6531 case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
6532 case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
6533 case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
6534 case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
6535 case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
6536 case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
6537 case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
6538 case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
6539 case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
6540 case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
6541 case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
6542 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
6543 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
6544 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
6545 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
6546 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
6547 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
6548 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
6549 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
6550 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
6551 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
6552 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
6553 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
6554 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
6555 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
6556 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
6557 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
6558 return true;
6559 } // end switch (Pattern)
6560 return false;
6561 }
6562
6563 /// Find other MI combine patterns.
6564 static bool getMiscPatterns(MachineInstr &Root,
6565 SmallVectorImpl<unsigned> &Patterns) {
6566 // A - (B + C) ==> (A - B) - C or (A - C) - B
6567 unsigned Opc = Root.getOpcode();
6568 MachineBasicBlock &MBB = *Root.getParent();
6569
6570 switch (Opc) {
6571 case AArch64::SUBWrr:
6572 case AArch64::SUBSWrr:
6573 case AArch64::SUBXrr:
6574 case AArch64::SUBSXrr:
6575 // Found candidate root.
6576 break;
6577 default:
6578 return false;
6579 }
6580
6581 if (isCombineInstrSettingFlag(Opc) &&
6582 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) ==
6583 -1)
6584 return false;
6585
6586 if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
6587 canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
6588 canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
6589 canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
6590 Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP1);
6591 Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP2);
6592 return true;
6593 }
6594
6595 return false;
6596 }
6597
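// The SUBADD reassociations above replace one instruction with two, so they
// only pay off when they shorten the critical path; returning MustReduceDepth
// tells the generic combiner to apply them only when the dependence depth
// actually decreases.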
6598 CombinerObjective
6599 AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
6600 switch (Pattern) {
6601 case AArch64MachineCombinerPattern::SUBADD_OP1:
6602 case AArch64MachineCombinerPattern::SUBADD_OP2:
6603 return CombinerObjective::MustReduceDepth;
6604 default:
6605 return TargetInstrInfo::getCombinerObjective(Pattern);
6606 }
6607 }
6608
6609 /// Return true when there is potentially a faster code sequence for an
6610 /// instruction chain ending in \p Root. All potential patterns are listed in
6611 /// the \p Patterns vector. Patterns should be sorted in priority order since the
6612 /// pattern evaluator stops checking as soon as it finds a faster sequence.
6613
6614 bool AArch64InstrInfo::getMachineCombinerPatterns(
6615 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
6616 bool DoRegPressureReduce) const {
6617 // Integer patterns
6618 if (getMaddPatterns(Root, Patterns))
6619 return true;
6620 // Floating point patterns
6621 if (getFMULPatterns(Root, Patterns))
6622 return true;
6623 if (getFMAPatterns(Root, Patterns))
6624 return true;
6625 if (getFNEGPatterns(Root, Patterns))
6626 return true;
6627
6628 // Other patterns
6629 if (getMiscPatterns(Root, Patterns))
6630 return true;
6631
6632 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
6633 DoRegPressureReduce);
6634 }
6635
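// FMAInstKind selects the operand order genFusedMultiply emits: Default builds
// scalar F|MADD-style operands (mul-op1, mul-op2, addend), Accumulator places
// the addend first (MLA/FMLA style), and Indexed does the same but also copies
// the lane immediate from the original FMUL.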
6636 enum class FMAInstKind { Default, Indexed, Accumulator };
6637 /// genFusedMultiply - Generate fused multiply instructions.
6638 /// This function supports both integer and floating point instructions.
6639 /// A typical example:
6640 /// F|MUL I=A,B,0
6641 /// F|ADD R,I,C
6642 /// ==> F|MADD R,A,B,C
6643 /// \param MF Containing MachineFunction
6644 /// \param MRI Register information
6645 /// \param TII Target information
6646 /// \param Root is the F|ADD instruction
6647 /// \param [out] InsInstrs is a vector of machine instructions and will
6648 /// contain the generated madd instruction
6649 /// \param IdxMulOpd is index of operand in Root that is the result of
6650 /// the F|MUL. In the example above IdxMulOpd is 1.
6651 /// \param MaddOpc the opcode of the f|madd instruction
6652 /// \param RC Register class of operands
6653 /// \param kind Kind of fma instruction (addressing mode) to be generated
6654 /// \param ReplacedAddend is the result register from the instruction
6655 /// replacing the non-combined operand, if any.
6656 static MachineInstr *
6657 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
6658 const TargetInstrInfo *TII, MachineInstr &Root,
6659 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
6660 unsigned MaddOpc, const TargetRegisterClass *RC,
6661 FMAInstKind kind = FMAInstKind::Default,
6662 const Register *ReplacedAddend = nullptr) {
6663 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
6664
6665 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
6666 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
6667 Register ResultReg = Root.getOperand(0).getReg();
6668 Register SrcReg0 = MUL->getOperand(1).getReg();
6669 bool Src0IsKill = MUL->getOperand(1).isKill();
6670 Register SrcReg1 = MUL->getOperand(2).getReg();
6671 bool Src1IsKill = MUL->getOperand(2).isKill();
6672
6673 Register SrcReg2;
6674 bool Src2IsKill;
6675 if (ReplacedAddend) {
6676 // If we just generated a new addend, we must be its only use.
6677 SrcReg2 = *ReplacedAddend;
6678 Src2IsKill = true;
6679 } else {
6680 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
6681 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
6682 }
6683
6684 if (ResultReg.isVirtual())
6685 MRI.constrainRegClass(ResultReg, RC);
6686 if (SrcReg0.isVirtual())
6687 MRI.constrainRegClass(SrcReg0, RC);
6688 if (SrcReg1.isVirtual())
6689 MRI.constrainRegClass(SrcReg1, RC);
6690 if (SrcReg2.isVirtual())
6691 MRI.constrainRegClass(SrcReg2, RC);
6692
6693 MachineInstrBuilder MIB;
6694 if (kind == FMAInstKind::Default)
6695 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
6696 .addReg(SrcReg0, getKillRegState(Src0IsKill))
6697 .addReg(SrcReg1, getKillRegState(Src1IsKill))
6698 .addReg(SrcReg2, getKillRegState(Src2IsKill));
6699 else if (kind == FMAInstKind::Indexed)
6700 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
6701 .addReg(SrcReg2, getKillRegState(Src2IsKill))
6702 .addReg(SrcReg0, getKillRegState(Src0IsKill))
6703 .addReg(SrcReg1, getKillRegState(Src1IsKill))
6704 .addImm(MUL->getOperand(3).getImm());
6705 else if (kind == FMAInstKind::Accumulator)
6706 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
6707 .addReg(SrcReg2, getKillRegState(Src2IsKill))
6708 .addReg(SrcReg0, getKillRegState(Src0IsKill))
6709 .addReg(SrcReg1, getKillRegState(Src1IsKill));
6710 else
6711 assert(false && "Invalid FMA instruction kind \n");
6712 // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
6713 InsInstrs.push_back(MIB);
6714 return MUL;
6715 }
6716
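/// Rewrite FNEG(FMADD a, b, c) into FNMADD a, b, c; FNMADD computes
/// -(a*b) - c, which is exactly the negated fused multiply-add matched by
/// getFNEGPatterns.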
6717 static MachineInstr *
6718 genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI,
6719 const TargetInstrInfo *TII, MachineInstr &Root,
6720 SmallVectorImpl<MachineInstr *> &InsInstrs) {
6721 MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
6722
6723 unsigned Opc = 0;
6724 const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());
6725 if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6726 Opc = AArch64::FNMADDSrrr;
6727 else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
6728 Opc = AArch64::FNMADDDrrr;
6729 else
6730 return nullptr;
6731
6732 Register ResultReg = Root.getOperand(0).getReg();
6733 Register SrcReg0 = MAD->getOperand(1).getReg();
6734 Register SrcReg1 = MAD->getOperand(2).getReg();
6735 Register SrcReg2 = MAD->getOperand(3).getReg();
6736 bool Src0IsKill = MAD->getOperand(1).isKill();
6737 bool Src1IsKill = MAD->getOperand(2).isKill();
6738 bool Src2IsKill = MAD->getOperand(3).isKill();
6739 if (ResultReg.isVirtual())
6740 MRI.constrainRegClass(ResultReg, RC);
6741 if (SrcReg0.isVirtual())
6742 MRI.constrainRegClass(SrcReg0, RC);
6743 if (SrcReg1.isVirtual())
6744 MRI.constrainRegClass(SrcReg1, RC);
6745 if (SrcReg2.isVirtual())
6746 MRI.constrainRegClass(SrcReg2, RC);
6747
6748 MachineInstrBuilder MIB =
6749 BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)
6750 .addReg(SrcReg0, getKillRegState(Src0IsKill))
6751 .addReg(SrcReg1, getKillRegState(Src1IsKill))
6752 .addReg(SrcReg2, getKillRegState(Src2IsKill));
6753 InsInstrs.push_back(MIB);
6754
6755 return MAD;
6756 }
6757
6758 /// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
6759 static MachineInstr *
6760 genIndexedMultiply(MachineInstr &Root,
6761 SmallVectorImpl<MachineInstr *> &InsInstrs,
6762 unsigned IdxDupOp, unsigned MulOpc,
6763 const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
6764 assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
6765 "Invalid index of FMUL operand");
6766
6767 MachineFunction &MF = *Root.getMF();
6768 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
6769
6770 MachineInstr *Dup =
6771 MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());
6772
6773 if (Dup->getOpcode() == TargetOpcode::COPY)
6774 Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg());
6775
6776 Register DupSrcReg = Dup->getOperand(1).getReg();
6777 MRI.clearKillFlags(DupSrcReg);
6778 MRI.constrainRegClass(DupSrcReg, RC);
6779
6780 unsigned DupSrcLane = Dup->getOperand(2).getImm();
6781
6782 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
6783 MachineOperand &MulOp = Root.getOperand(IdxMulOp);
6784
6785 Register ResultReg = Root.getOperand(0).getReg();
6786
6787 MachineInstrBuilder MIB;
6788 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg)
6789 .add(MulOp)
6790 .addReg(DupSrcReg)
6791 .addImm(DupSrcLane);
6792
6793 InsInstrs.push_back(MIB);
6794 return &Root;
6795 }
6796
6797 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
6798 /// instructions.
6799 ///
6800 /// \see genFusedMultiply
6801 static MachineInstr *genFusedMultiplyAcc(
6802 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6803 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6804 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
6805 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6806 FMAInstKind::Accumulator);
6807 }
6808
6809 /// genNeg - Helper to generate an intermediate negation of the second operand
6810 /// of Root
6811 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
6812 const TargetInstrInfo *TII, MachineInstr &Root,
6813 SmallVectorImpl<MachineInstr *> &InsInstrs,
6814 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
6815 unsigned MnegOpc, const TargetRegisterClass *RC) {
6816 Register NewVR = MRI.createVirtualRegister(RC);
6817 MachineInstrBuilder MIB =
6818 BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR)
6819 .add(Root.getOperand(2));
6820 InsInstrs.push_back(MIB);
6821
6822 assert(InstrIdxForVirtReg.empty());
6823 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
6824
6825 return NewVR;
6826 }
6827
6828 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
6829 /// instructions with an additional negation of the accumulator
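/// For example, for the MULSUBv*_OP1 patterns:
///   SUB R, (MUL A, B), C  ==>  NEG V, C ; MLA R, V, A, B   (= A*B - C)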
6830 static MachineInstr *genFusedMultiplyAccNeg(
6831 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6832 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6833 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
6834 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
6835 assert(IdxMulOpd == 1);
6836
6837 Register NewVR =
6838 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
6839 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6840 FMAInstKind::Accumulator, &NewVR);
6841 }
6842
6843 /// genFusedMultiplyIdx - Helper to generate indexed fused multiply accumulate
6844 /// instructions.
6845 ///
6846 /// \see genFusedMultiply
6847 static MachineInstr *genFusedMultiplyIdx(
6848 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6849 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6850 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
6851 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6852 FMAInstKind::Indexed);
6853 }
6854
6855 /// genFusedMultiplyIdxNeg - Helper to generate indexed fused multiply accumulate
6856 /// instructions with an additional negation of the accumulator
6857 static MachineInstr *genFusedMultiplyIdxNeg(
6858 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6859 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6860 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
6861 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
6862 assert(IdxMulOpd == 1);
6863
6864 Register NewVR =
6865 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
6866
6867 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6868 FMAInstKind::Indexed, &NewVR);
6869 }
6870
6871 /// genMaddR - Generate madd instruction and combine mul and add using
6872 /// an extra virtual register
6873 /// Example - an ADD intermediate needs to be stored in a register:
6874 /// MUL I=A,B,0
6875 /// ADD R,I,Imm
6876 /// ==> ORR V, ZR, Imm
6877 /// ==> MADD R,A,B,V
6878 /// \param MF Containing MachineFunction
6879 /// \param MRI Register information
6880 /// \param TII Target information
6881 /// \param Root is the ADD instruction
6882 /// \param [out] InsInstrs is a vector of machine instructions and will
6883 /// contain the generated madd instruction
6884 /// \param IdxMulOpd is index of operand in Root that is the result of
6885 /// the MUL. In the example above IdxMulOpd is 1.
6886 /// \param MaddOpc the opcode of the madd instruction
6887 /// \param VR is a virtual register that holds the value of an ADD operand
6888 /// (V in the example above).
6889 /// \param RC Register class of operands
6890 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
6891 const TargetInstrInfo *TII, MachineInstr &Root,
6892 SmallVectorImpl<MachineInstr *> &InsInstrs,
6893 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
6894 const TargetRegisterClass *RC) {
6895 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
6896
6897 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
6898 Register ResultReg = Root.getOperand(0).getReg();
6899 Register SrcReg0 = MUL->getOperand(1).getReg();
6900 bool Src0IsKill = MUL->getOperand(1).isKill();
6901 Register SrcReg1 = MUL->getOperand(2).getReg();
6902 bool Src1IsKill = MUL->getOperand(2).isKill();
6903
6904 if (ResultReg.isVirtual())
6905 MRI.constrainRegClass(ResultReg, RC);
6906 if (SrcReg0.isVirtual())
6907 MRI.constrainRegClass(SrcReg0, RC);
6908 if (SrcReg1.isVirtual())
6909 MRI.constrainRegClass(SrcReg1, RC);
6910 if (Register::isVirtualRegister(VR))
6911 MRI.constrainRegClass(VR, RC);
6912
6913 MachineInstrBuilder MIB =
6914 BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
6915 .addReg(SrcReg0, getKillRegState(Src0IsKill))
6916 .addReg(SrcReg1, getKillRegState(Src1IsKill))
6917 .addReg(VR);
6918 // Insert the MADD
6919 InsInstrs.push_back(MIB);
6920 return MUL;
6921 }
6922
6923 /// Do the following transformation
6924 /// A - (B + C) ==> (A - B) - C
6925 /// A - (B + C) ==> (A - C) - B
6926 static void
6927 genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
6928 const TargetInstrInfo *TII, MachineInstr &Root,
6929 SmallVectorImpl<MachineInstr *> &InsInstrs,
6930 SmallVectorImpl<MachineInstr *> &DelInstrs,
6931 unsigned IdxOpd1,
6932 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
6933 assert(IdxOpd1 == 1 || IdxOpd1 == 2);
6934 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
6935 MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
6936
6937 Register ResultReg = Root.getOperand(0).getReg();
6938 Register RegA = Root.getOperand(1).getReg();
6939 bool RegAIsKill = Root.getOperand(1).isKill();
6940 Register RegB = AddMI->getOperand(IdxOpd1).getReg();
6941 bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
6942 Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
6943 bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
6944 Register NewVR = MRI.createVirtualRegister(MRI.getRegClass(RegA));
6945
6946 unsigned Opcode = Root.getOpcode();
6947 if (Opcode == AArch64::SUBSWrr)
6948 Opcode = AArch64::SUBWrr;
6949 else if (Opcode == AArch64::SUBSXrr)
6950 Opcode = AArch64::SUBXrr;
6951 else
6952 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
6953 "Unexpected instruction opcode.");
6954
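// The reassociated subtractions can wrap where the original A - (B + C) did
// not, so conservatively drop the no-wrap flags.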
6955 uint32_t Flags = Root.mergeFlagsWith(*AddMI);
6956 Flags &= ~MachineInstr::NoSWrap;
6957 Flags &= ~MachineInstr::NoUWrap;
6958
6959 MachineInstrBuilder MIB1 =
6960 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR)
6961 .addReg(RegA, getKillRegState(RegAIsKill))
6962 .addReg(RegB, getKillRegState(RegBIsKill))
6963 .setMIFlags(Flags);
6964 MachineInstrBuilder MIB2 =
6965 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg)
6966 .addReg(NewVR, getKillRegState(true))
6967 .addReg(RegC, getKillRegState(RegCIsKill))
6968 .setMIFlags(Flags);
6969
6970 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
6971 InsInstrs.push_back(MIB1);
6972 InsInstrs.push_back(MIB2);
6973 DelInstrs.push_back(AddMI);
6974 DelInstrs.push_back(&Root);
6975 }
6976
6977 /// When getMachineCombinerPatterns() finds potential patterns,
6978 /// this function generates the instructions that could replace the
6979 /// original code sequence
6980 void AArch64InstrInfo::genAlternativeCodeSequence(
6981 MachineInstr &Root, unsigned Pattern,
6982 SmallVectorImpl<MachineInstr *> &InsInstrs,
6983 SmallVectorImpl<MachineInstr *> &DelInstrs,
6984 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
6985 MachineBasicBlock &MBB = *Root.getParent();
6986 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6987 MachineFunction &MF = *MBB.getParent();
6988 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
6989
6990 MachineInstr *MUL = nullptr;
6991 const TargetRegisterClass *RC;
6992 unsigned Opc;
6993 switch (Pattern) {
6994 default:
6995 // Reassociate instructions.
6996 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
6997 DelInstrs, InstrIdxForVirtReg);
6998 return;
6999 case AArch64MachineCombinerPattern::SUBADD_OP1:
7000 // A - (B + C)
7001 // ==> (A - B) - C
7002 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
7003 InstrIdxForVirtReg);
7004 return;
7005 case AArch64MachineCombinerPattern::SUBADD_OP2:
7006 // A - (B + C)
7007 // ==> (A - C) - B
7008 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
7009 InstrIdxForVirtReg);
7010 return;
7011 case AArch64MachineCombinerPattern::MULADDW_OP1:
7012 case AArch64MachineCombinerPattern::MULADDX_OP1:
7013 // MUL I=A,B,0
7014 // ADD R,I,C
7015 // ==> MADD R,A,B,C
7016 // --- Create(MADD);
7017 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP1) {
7018 Opc = AArch64::MADDWrrr;
7019 RC = &AArch64::GPR32RegClass;
7020 } else {
7021 Opc = AArch64::MADDXrrr;
7022 RC = &AArch64::GPR64RegClass;
7023 }
7024 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7025 break;
7026 case AArch64MachineCombinerPattern::MULADDW_OP2:
7027 case AArch64MachineCombinerPattern::MULADDX_OP2:
7028 // MUL I=A,B,0
7029 // ADD R,C,I
7030 // ==> MADD R,A,B,C
7031 // --- Create(MADD);
7032 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP2) {
7033 Opc = AArch64::MADDWrrr;
7034 RC = &AArch64::GPR32RegClass;
7035 } else {
7036 Opc = AArch64::MADDXrrr;
7037 RC = &AArch64::GPR64RegClass;
7038 }
7039 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7040 break;
7041 case AArch64MachineCombinerPattern::MULADDWI_OP1:
7042 case AArch64MachineCombinerPattern::MULADDXI_OP1: {
7043 // MUL I=A,B,0
7044 // ADD R,I,Imm
7045 // ==> MOV V, Imm
7046 // ==> MADD R,A,B,V
7047 // --- Create(MADD);
7048 const TargetRegisterClass *OrrRC;
7049 unsigned BitSize, OrrOpc, ZeroReg;
7050 if (Pattern == AArch64MachineCombinerPattern::MULADDWI_OP1) {
7051 OrrOpc = AArch64::ORRWri;
7052 OrrRC = &AArch64::GPR32spRegClass;
7053 BitSize = 32;
7054 ZeroReg = AArch64::WZR;
7055 Opc = AArch64::MADDWrrr;
7056 RC = &AArch64::GPR32RegClass;
7057 } else {
7058 OrrOpc = AArch64::ORRXri;
7059 OrrRC = &AArch64::GPR64spRegClass;
7060 BitSize = 64;
7061 ZeroReg = AArch64::XZR;
7062 Opc = AArch64::MADDXrrr;
7063 RC = &AArch64::GPR64RegClass;
7064 }
7065 Register NewVR = MRI.createVirtualRegister(OrrRC);
7066 uint64_t Imm = Root.getOperand(2).getImm();
7067
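// The ADD may carry a shifted immediate; operand 3 holds the left-shift
// amount, so fold it into the immediate before trying to materialize it.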
7068 if (Root.getOperand(3).isImm()) {
7069 unsigned Val = Root.getOperand(3).getImm();
7070 Imm = Imm << Val;
7071 }
7072 uint64_t UImm = SignExtend64(Imm, BitSize);
7073 // Check whether the immediate can be materialized with a single instruction.
7074 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
7075 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
7076 if (Insn.size() != 1)
7077 return;
7078 auto MovI = Insn.begin();
7079 MachineInstrBuilder MIB1;
7080 // MOV is an alias for one of three instructions: movz, movn, and orr.
7081 if (MovI->Opcode == OrrOpc)
7082 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR)
7083 .addReg(ZeroReg)
7084 .addImm(MovI->Op2);
7085 else {
7086 if (BitSize == 32)
7087 assert((MovI->Opcode == AArch64::MOVNWi ||
7088 MovI->Opcode == AArch64::MOVZWi) &&
7089 "Expected opcode");
7090 else
7091 assert((MovI->Opcode == AArch64::MOVNXi ||
7092 MovI->Opcode == AArch64::MOVZXi) &&
7093 "Expected opcode");
7094 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR)
7095 .addImm(MovI->Op1)
7096 .addImm(MovI->Op2);
7097 }
7098 InsInstrs.push_back(MIB1);
7099 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7100 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7101 break;
7102 }
7103 case AArch64MachineCombinerPattern::MULSUBW_OP1:
7104 case AArch64MachineCombinerPattern::MULSUBX_OP1: {
7105 // MUL I=A,B,0
7106 // SUB R,I, C
7107 // ==> SUB V, 0, C
7108 // ==> MADD R,A,B,V // = -C + A*B
7109 // --- Create(MADD);
7110 const TargetRegisterClass *SubRC;
7111 unsigned SubOpc, ZeroReg;
7112 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP1) {
7113 SubOpc = AArch64::SUBWrr;
7114 SubRC = &AArch64::GPR32spRegClass;
7115 ZeroReg = AArch64::WZR;
7116 Opc = AArch64::MADDWrrr;
7117 RC = &AArch64::GPR32RegClass;
7118 } else {
7119 SubOpc = AArch64::SUBXrr;
7120 SubRC = &AArch64::GPR64spRegClass;
7121 ZeroReg = AArch64::XZR;
7122 Opc = AArch64::MADDXrrr;
7123 RC = &AArch64::GPR64RegClass;
7124 }
7125 Register NewVR = MRI.createVirtualRegister(SubRC);
7126 // SUB NewVR, 0, C
7127 MachineInstrBuilder MIB1 =
7128 BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR)
7129 .addReg(ZeroReg)
7130 .add(Root.getOperand(2));
7131 InsInstrs.push_back(MIB1);
7132 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7133 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7134 break;
7135 }
7136 case AArch64MachineCombinerPattern::MULSUBW_OP2:
7137 case AArch64MachineCombinerPattern::MULSUBX_OP2:
7138 // MUL I=A,B,0
7139 // SUB R,C,I
7140 // ==> MSUB R,A,B,C (computes C - A*B)
7141 // --- Create(MSUB);
7142 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP2) {
7143 Opc = AArch64::MSUBWrrr;
7144 RC = &AArch64::GPR32RegClass;
7145 } else {
7146 Opc = AArch64::MSUBXrrr;
7147 RC = &AArch64::GPR64RegClass;
7148 }
7149 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7150 break;
7151 case AArch64MachineCombinerPattern::MULSUBWI_OP1:
7152 case AArch64MachineCombinerPattern::MULSUBXI_OP1: {
7153 // MUL I=A,B,0
7154 // SUB R,I, Imm
7155 // ==> MOV V, -Imm
7156 // ==> MADD R,A,B,V // = -Imm + A*B
7157 // --- Create(MADD);
7158 const TargetRegisterClass *OrrRC;
7159 unsigned BitSize, OrrOpc, ZeroReg;
7160 if (Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1) {
7161 OrrOpc = AArch64::ORRWri;
7162 OrrRC = &AArch64::GPR32spRegClass;
7163 BitSize = 32;
7164 ZeroReg = AArch64::WZR;
7165 Opc = AArch64::MADDWrrr;
7166 RC = &AArch64::GPR32RegClass;
7167 } else {
7168 OrrOpc = AArch64::ORRXri;
7169 OrrRC = &AArch64::GPR64spRegClass;
7170 BitSize = 64;
7171 ZeroReg = AArch64::XZR;
7172 Opc = AArch64::MADDXrrr;
7173 RC = &AArch64::GPR64RegClass;
7174 }
7175 Register NewVR = MRI.createVirtualRegister(OrrRC);
7176 uint64_t Imm = Root.getOperand(2).getImm();
7177 if (Root.getOperand(3).isImm()) {
7178 unsigned Val = Root.getOperand(3).getImm();
7179 Imm = Imm << Val;
7180 }
7181 uint64_t UImm = SignExtend64(-Imm, BitSize);
7182 // Check whether the immediate can be materialized with a single instruction.
7183 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
7184 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
7185 if (Insn.size() != 1)
7186 return;
7187 auto MovI = Insn.begin();
7188 MachineInstrBuilder MIB1;
7189 // MOV is an alias for one of three instructions: movz, movn, and orr.
7190 if (MovI->Opcode == OrrOpc)
7191 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR)
7192 .addReg(ZeroReg)
7193 .addImm(MovI->Op2);
7194 else {
7195 if (BitSize == 32)
7196 assert((MovI->Opcode == AArch64::MOVNWi ||
7197 MovI->Opcode == AArch64::MOVZWi) &&
7198 "Expected opcode");
7199 else
7200 assert((MovI->Opcode == AArch64::MOVNXi ||
7201 MovI->Opcode == AArch64::MOVZXi) &&
7202 "Expected opcode");
7203 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR)
7204 .addImm(MovI->Op1)
7205 .addImm(MovI->Op2);
7206 }
7207 InsInstrs.push_back(MIB1);
7208 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7209 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
7210 break;
7211 }
7212
7213 case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
7214 Opc = AArch64::MLAv8i8;
7215 RC = &AArch64::FPR64RegClass;
7216 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7217 break;
7218 case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
7219 Opc = AArch64::MLAv8i8;
7220 RC = &AArch64::FPR64RegClass;
7221 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7222 break;
7223 case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
7224 Opc = AArch64::MLAv16i8;
7225 RC = &AArch64::FPR128RegClass;
7226 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7227 break;
7228 case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
7229 Opc = AArch64::MLAv16i8;
7230 RC = &AArch64::FPR128RegClass;
7231 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7232 break;
7233 case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
7234 Opc = AArch64::MLAv4i16;
7235 RC = &AArch64::FPR64RegClass;
7236 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7237 break;
7238 case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
7239 Opc = AArch64::MLAv4i16;
7240 RC = &AArch64::FPR64RegClass;
7241 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7242 break;
7243 case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
7244 Opc = AArch64::MLAv8i16;
7245 RC = &AArch64::FPR128RegClass;
7246 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7247 break;
7248 case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
7249 Opc = AArch64::MLAv8i16;
7250 RC = &AArch64::FPR128RegClass;
7251 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7252 break;
7253 case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
7254 Opc = AArch64::MLAv2i32;
7255 RC = &AArch64::FPR64RegClass;
7256 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7257 break;
7258 case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
7259 Opc = AArch64::MLAv2i32;
7260 RC = &AArch64::FPR64RegClass;
7261 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7262 break;
7263 case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
7264 Opc = AArch64::MLAv4i32;
7265 RC = &AArch64::FPR128RegClass;
7266 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7267 break;
7268 case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
7269 Opc = AArch64::MLAv4i32;
7270 RC = &AArch64::FPR128RegClass;
7271 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7272 break;
7273
7274 case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
7275 Opc = AArch64::MLAv8i8;
7276 RC = &AArch64::FPR64RegClass;
7277 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7278 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
7279 RC);
7280 break;
7281 case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
7282 Opc = AArch64::MLSv8i8;
7283 RC = &AArch64::FPR64RegClass;
7284 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7285 break;
7286 case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
7287 Opc = AArch64::MLAv16i8;
7288 RC = &AArch64::FPR128RegClass;
7289 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7290 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
7291 RC);
7292 break;
7293 case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
7294 Opc = AArch64::MLSv16i8;
7295 RC = &AArch64::FPR128RegClass;
7296 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7297 break;
7298 case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
7299 Opc = AArch64::MLAv4i16;
7300 RC = &AArch64::FPR64RegClass;
7301 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7302 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
7303 RC);
7304 break;
7305 case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
7306 Opc = AArch64::MLSv4i16;
7307 RC = &AArch64::FPR64RegClass;
7308 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7309 break;
7310 case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
7311 Opc = AArch64::MLAv8i16;
7312 RC = &AArch64::FPR128RegClass;
7313 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7314 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
7315 RC);
7316 break;
7317 case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
7318 Opc = AArch64::MLSv8i16;
7319 RC = &AArch64::FPR128RegClass;
7320 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7321 break;
7322 case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
7323 Opc = AArch64::MLAv2i32;
7324 RC = &AArch64::FPR64RegClass;
7325 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7326 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
7327 RC);
7328 break;
7329 case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
7330 Opc = AArch64::MLSv2i32;
7331 RC = &AArch64::FPR64RegClass;
7332 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7333 break;
7334 case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
7335 Opc = AArch64::MLAv4i32;
7336 RC = &AArch64::FPR128RegClass;
7337 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7338 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
7339 RC);
7340 break;
7341 case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
7342 Opc = AArch64::MLSv4i32;
7343 RC = &AArch64::FPR128RegClass;
7344 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7345 break;
7346
7347 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
7348 Opc = AArch64::MLAv4i16_indexed;
7349 RC = &AArch64::FPR64RegClass;
7350 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7351 break;
7352 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
7353 Opc = AArch64::MLAv4i16_indexed;
7354 RC = &AArch64::FPR64RegClass;
7355 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7356 break;
7357 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
7358 Opc = AArch64::MLAv8i16_indexed;
7359 RC = &AArch64::FPR128RegClass;
7360 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7361 break;
7362 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
7363 Opc = AArch64::MLAv8i16_indexed;
7364 RC = &AArch64::FPR128RegClass;
7365 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7366 break;
7367 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
7368 Opc = AArch64::MLAv2i32_indexed;
7369 RC = &AArch64::FPR64RegClass;
7370 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7371 break;
7372 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
7373 Opc = AArch64::MLAv2i32_indexed;
7374 RC = &AArch64::FPR64RegClass;
7375 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7376 break;
7377 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
7378 Opc = AArch64::MLAv4i32_indexed;
7379 RC = &AArch64::FPR128RegClass;
7380 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7381 break;
7382 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
7383 Opc = AArch64::MLAv4i32_indexed;
7384 RC = &AArch64::FPR128RegClass;
7385 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7386 break;
7387
7388 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
7389 Opc = AArch64::MLAv4i16_indexed;
7390 RC = &AArch64::FPR64RegClass;
7391 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7392 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
7393 RC);
7394 break;
7395 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
7396 Opc = AArch64::MLSv4i16_indexed;
7397 RC = &AArch64::FPR64RegClass;
7398 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7399 break;
7400 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
7401 Opc = AArch64::MLAv8i16_indexed;
7402 RC = &AArch64::FPR128RegClass;
7403 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7404 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
7405 RC);
7406 break;
7407 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
7408 Opc = AArch64::MLSv8i16_indexed;
7409 RC = &AArch64::FPR128RegClass;
7410 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7411 break;
7412 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
7413 Opc = AArch64::MLAv2i32_indexed;
7414 RC = &AArch64::FPR64RegClass;
7415 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7416 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
7417 RC);
7418 break;
7419 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
7420 Opc = AArch64::MLSv2i32_indexed;
7421 RC = &AArch64::FPR64RegClass;
7422 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7423 break;
7424 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
7425 Opc = AArch64::MLAv4i32_indexed;
7426 RC = &AArch64::FPR128RegClass;
7427 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7428 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
7429 RC);
7430 break;
7431 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
7432 Opc = AArch64::MLSv4i32_indexed;
7433 RC = &AArch64::FPR128RegClass;
7434 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7435 break;
7436
7437 // Floating Point Support
7438 case AArch64MachineCombinerPattern::FMULADDH_OP1:
7439 Opc = AArch64::FMADDHrrr;
7440 RC = &AArch64::FPR16RegClass;
7441 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7442 break;
7443 case AArch64MachineCombinerPattern::FMULADDS_OP1:
7444 Opc = AArch64::FMADDSrrr;
7445 RC = &AArch64::FPR32RegClass;
7446 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7447 break;
7448 case AArch64MachineCombinerPattern::FMULADDD_OP1:
7449 Opc = AArch64::FMADDDrrr;
7450 RC = &AArch64::FPR64RegClass;
7451 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7452 break;
7453
7454 case AArch64MachineCombinerPattern::FMULADDH_OP2:
7455 Opc = AArch64::FMADDHrrr;
7456 RC = &AArch64::FPR16RegClass;
7457 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7458 break;
7459 case AArch64MachineCombinerPattern::FMULADDS_OP2:
7460 Opc = AArch64::FMADDSrrr;
7461 RC = &AArch64::FPR32RegClass;
7462 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7463 break;
7464 case AArch64MachineCombinerPattern::FMULADDD_OP2:
7465 Opc = AArch64::FMADDDrrr;
7466 RC = &AArch64::FPR64RegClass;
7467 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7468 break;
7469
7470 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
7471 Opc = AArch64::FMLAv1i32_indexed;
7472 RC = &AArch64::FPR32RegClass;
7473 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7474 FMAInstKind::Indexed);
7475 break;
7476 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
7477 Opc = AArch64::FMLAv1i32_indexed;
7478 RC = &AArch64::FPR32RegClass;
7479 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7480 FMAInstKind::Indexed);
7481 break;
7482
7483 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
7484 Opc = AArch64::FMLAv1i64_indexed;
7485 RC = &AArch64::FPR64RegClass;
7486 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7487 FMAInstKind::Indexed);
7488 break;
7489 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
7490 Opc = AArch64::FMLAv1i64_indexed;
7491 RC = &AArch64::FPR64RegClass;
7492 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7493 FMAInstKind::Indexed);
7494 break;
7495
7496 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
7497 RC = &AArch64::FPR64RegClass;
7498 Opc = AArch64::FMLAv4i16_indexed;
7499 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7500 FMAInstKind::Indexed);
7501 break;
7502 case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
7503 RC = &AArch64::FPR64RegClass;
7504 Opc = AArch64::FMLAv4f16;
7505 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7506 FMAInstKind::Accumulator);
7507 break;
7508 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
7509 RC = &AArch64::FPR64RegClass;
7510 Opc = AArch64::FMLAv4i16_indexed;
7511 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7512 FMAInstKind::Indexed);
7513 break;
7514 case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
7515 RC = &AArch64::FPR64RegClass;
7516 Opc = AArch64::FMLAv4f16;
7517 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7518 FMAInstKind::Accumulator);
7519 break;
7520
7521 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
7522 case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
7523 RC = &AArch64::FPR64RegClass;
7524 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
7525 Opc = AArch64::FMLAv2i32_indexed;
7526 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7527 FMAInstKind::Indexed);
7528 } else {
7529 Opc = AArch64::FMLAv2f32;
7530 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7531 FMAInstKind::Accumulator);
7532 }
7533 break;
7534 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
7535 case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
7536 RC = &AArch64::FPR64RegClass;
7537 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
7538 Opc = AArch64::FMLAv2i32_indexed;
7539 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7540 FMAInstKind::Indexed);
7541 } else {
7542 Opc = AArch64::FMLAv2f32;
7543 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7544 FMAInstKind::Accumulator);
7545 }
7546 break;
7547
7548 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
7549 RC = &AArch64::FPR128RegClass;
7550 Opc = AArch64::FMLAv8i16_indexed;
7551 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7552 FMAInstKind::Indexed);
7553 break;
7554 case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
7555 RC = &AArch64::FPR128RegClass;
7556 Opc = AArch64::FMLAv8f16;
7557 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7558 FMAInstKind::Accumulator);
7559 break;
7560 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
7561 RC = &AArch64::FPR128RegClass;
7562 Opc = AArch64::FMLAv8i16_indexed;
7563 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7564 FMAInstKind::Indexed);
7565 break;
7566 case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
7567 RC = &AArch64::FPR128RegClass;
7568 Opc = AArch64::FMLAv8f16;
7569 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7570 FMAInstKind::Accumulator);
7571 break;
7572
7573 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
7574 case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
7575 RC = &AArch64::FPR128RegClass;
7576 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
7577 Opc = AArch64::FMLAv2i64_indexed;
7578 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7579 FMAInstKind::Indexed);
7580 } else {
7581 Opc = AArch64::FMLAv2f64;
7582 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7583 FMAInstKind::Accumulator);
7584 }
7585 break;
7586 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
7587 case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
7588 RC = &AArch64::FPR128RegClass;
7589 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
7590 Opc = AArch64::FMLAv2i64_indexed;
7591 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7592 FMAInstKind::Indexed);
7593 } else {
7594 Opc = AArch64::FMLAv2f64;
7595 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7596 FMAInstKind::Accumulator);
7597 }
7598 break;
7599
7600 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
7601 case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
7602 RC = &AArch64::FPR128RegClass;
7603 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
7604 Opc = AArch64::FMLAv4i32_indexed;
7605 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7606 FMAInstKind::Indexed);
7607 } else {
7608 Opc = AArch64::FMLAv4f32;
7609 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7610 FMAInstKind::Accumulator);
7611 }
7612 break;
7613
7614 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
7615 case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
7616 RC = &AArch64::FPR128RegClass;
7617 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
7618 Opc = AArch64::FMLAv4i32_indexed;
7619 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7620 FMAInstKind::Indexed);
7621 } else {
7622 Opc = AArch64::FMLAv4f32;
7623 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7624 FMAInstKind::Accumulator);
7625 }
7626 break;
7627
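// Scalar FSUB combines. In the OP1 forms the multiply is the minuend, so the
// result is A*B - C and FNMSUB (= Rn*Rm - Ra) is used; an FNMUL feeding the
// FSUB maps to FNMADD (= -(Rn*Rm) - Ra). In the OP2 forms the multiply is
// subtracted from the other operand, which is FMSUB (= Ra - Rn*Rm).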
7628 case AArch64MachineCombinerPattern::FMULSUBH_OP1:
7629 Opc = AArch64::FNMSUBHrrr;
7630 RC = &AArch64::FPR16RegClass;
7631 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7632 break;
7633 case AArch64MachineCombinerPattern::FMULSUBS_OP1:
7634 Opc = AArch64::FNMSUBSrrr;
7635 RC = &AArch64::FPR32RegClass;
7636 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7637 break;
7638 case AArch64MachineCombinerPattern::FMULSUBD_OP1:
7639 Opc = AArch64::FNMSUBDrrr;
7640 RC = &AArch64::FPR64RegClass;
7641 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7642 break;
7643
7644 case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
7645 Opc = AArch64::FNMADDHrrr;
7646 RC = &AArch64::FPR16RegClass;
7647 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7648 break;
7649 case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
7650 Opc = AArch64::FNMADDSrrr;
7651 RC = &AArch64::FPR32RegClass;
7652 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7653 break;
7654 case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
7655 Opc = AArch64::FNMADDDrrr;
7656 RC = &AArch64::FPR64RegClass;
7657 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
7658 break;
7659
7660 case AArch64MachineCombinerPattern::FMULSUBH_OP2:
7661 Opc = AArch64::FMSUBHrrr;
7662 RC = &AArch64::FPR16RegClass;
7663 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7664 break;
7665 case AArch64MachineCombinerPattern::FMULSUBS_OP2:
7666 Opc = AArch64::FMSUBSrrr;
7667 RC = &AArch64::FPR32RegClass;
7668 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7669 break;
7670 case AArch64MachineCombinerPattern::FMULSUBD_OP2:
7671 Opc = AArch64::FMSUBDrrr;
7672 RC = &AArch64::FPR64RegClass;
7673 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
7674 break;
7675
7676 case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
7677 Opc = AArch64::FMLSv1i32_indexed;
7678 RC = &AArch64::FPR32RegClass;
7679 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7680 FMAInstKind::Indexed);
7681 break;
7682
7683 case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
7684 Opc = AArch64::FMLSv1i64_indexed;
7685 RC = &AArch64::FPR64RegClass;
7686 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7687 FMAInstKind::Indexed);
7688 break;
7689
7690 case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
7691 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
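// Note: here the multiply feeds operand 1 of the FSUB, i.e. Root computes
// (A * B) - C. We negate the accumulator C with an FNEG and then emit an
// FMLA on the negated value, since (A * B) - C == FMLA(FNEG(C), A, B).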
7692 RC = &AArch64::FPR64RegClass;
7693 Register NewVR = MRI.createVirtualRegister(RC);
7694 MachineInstrBuilder MIB1 =
7695 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR)
7696 .add(Root.getOperand(2));
7697 InsInstrs.push_back(MIB1);
7698 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7699 if (Pattern == AArch64MachineCombinerPattern::FMLSv4f16_OP1) {
7700 Opc = AArch64::FMLAv4f16;
7701 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7702 FMAInstKind::Accumulator, &NewVR);
7703 } else {
7704 Opc = AArch64::FMLAv4i16_indexed;
7705 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7706 FMAInstKind::Indexed, &NewVR);
7707 }
7708 break;
7709 }
7710 case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
7711 RC = &AArch64::FPR64RegClass;
7712 Opc = AArch64::FMLSv4f16;
7713 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7714 FMAInstKind::Accumulator);
7715 break;
7716 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
7717 RC = &AArch64::FPR64RegClass;
7718 Opc = AArch64::FMLSv4i16_indexed;
7719 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7720 FMAInstKind::Indexed);
7721 break;
7722
7723 case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
7724 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
7725 RC = &AArch64::FPR64RegClass;
7726 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
7727 Opc = AArch64::FMLSv2i32_indexed;
7728 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7729 FMAInstKind::Indexed);
7730 } else {
7731 Opc = AArch64::FMLSv2f32;
7732 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7733 FMAInstKind::Accumulator);
7734 }
7735 break;
7736
7737 case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
7738 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
7739 RC = &AArch64::FPR128RegClass;
7740 Register NewVR = MRI.createVirtualRegister(RC);
7741 MachineInstrBuilder MIB1 =
7742 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR)
7743 .add(Root.getOperand(2));
7744 InsInstrs.push_back(MIB1);
7745 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7746 if (Pattern == AArch64MachineCombinerPattern::FMLSv8f16_OP1) {
7747 Opc = AArch64::FMLAv8f16;
7748 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7749 FMAInstKind::Accumulator, &NewVR);
7750 } else {
7751 Opc = AArch64::FMLAv8i16_indexed;
7752 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7753 FMAInstKind::Indexed, &NewVR);
7754 }
7755 break;
7756 }
7757 case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
7758 RC = &AArch64::FPR128RegClass;
7759 Opc = AArch64::FMLSv8f16;
7760 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7761 FMAInstKind::Accumulator);
7762 break;
7763 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
7764 RC = &AArch64::FPR128RegClass;
7765 Opc = AArch64::FMLSv8i16_indexed;
7766 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7767 FMAInstKind::Indexed);
7768 break;
7769
7770 case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
7771 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
7772 RC = &AArch64::FPR128RegClass;
7773 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
7774 Opc = AArch64::FMLSv2i64_indexed;
7775 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7776 FMAInstKind::Indexed);
7777 } else {
7778 Opc = AArch64::FMLSv2f64;
7779 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7780 FMAInstKind::Accumulator);
7781 }
7782 break;
7783
7784 case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
7785 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
7786 RC = &AArch64::FPR128RegClass;
7787 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
7788 Opc = AArch64::FMLSv4i32_indexed;
7789 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7790 FMAInstKind::Indexed);
7791 } else {
7792 Opc = AArch64::FMLSv4f32;
7793 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
7794 FMAInstKind::Accumulator);
7795 }
7796 break;
7797 case AArch64MachineCombinerPattern::FMLSv2f32_OP1:
7798 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
7799 RC = &AArch64::FPR64RegClass;
7800 Register NewVR = MRI.createVirtualRegister(RC);
7801 MachineInstrBuilder MIB1 =
7802 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR)
7803 .add(Root.getOperand(2));
7804 InsInstrs.push_back(MIB1);
7805 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7806 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
7807 Opc = AArch64::FMLAv2i32_indexed;
7808 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7809 FMAInstKind::Indexed, &NewVR);
7810 } else {
7811 Opc = AArch64::FMLAv2f32;
7812 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7813 FMAInstKind::Accumulator, &NewVR);
7814 }
7815 break;
7816 }
7817 case AArch64MachineCombinerPattern::FMLSv4f32_OP1:
7818 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
7819 RC = &AArch64::FPR128RegClass;
7820 Register NewVR = MRI.createVirtualRegister(RC);
7821 MachineInstrBuilder MIB1 =
7822 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR)
7823 .add(Root.getOperand(2));
7824 InsInstrs.push_back(MIB1);
7825 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7826 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
7827 Opc = AArch64::FMLAv4i32_indexed;
7828 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7829 FMAInstKind::Indexed, &NewVR);
7830 } else {
7831 Opc = AArch64::FMLAv4f32;
7832 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7833 FMAInstKind::Accumulator, &NewVR);
7834 }
7835 break;
7836 }
7837 case AArch64MachineCombinerPattern::FMLSv2f64_OP1:
7838 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
7839 RC = &AArch64::FPR128RegClass;
7840 Register NewVR = MRI.createVirtualRegister(RC);
7841 MachineInstrBuilder MIB1 =
7842 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR)
7843 .add(Root.getOperand(2));
7844 InsInstrs.push_back(MIB1);
7845 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
7846 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
7847 Opc = AArch64::FMLAv2i64_indexed;
7848 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7849 FMAInstKind::Indexed, &NewVR);
7850 } else {
7851 Opc = AArch64::FMLAv2f64;
7852 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
7853 FMAInstKind::Accumulator, &NewVR);
7854 }
7855 break;
7856 }
7857 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
7858 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2: {
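// One multiply operand is expected to be a DUP of a vector lane; IdxDupOp
// records whether it is operand 1 or operand 2, and genIndexedMultiply
// rewrites the FMUL into its indexed form that reads the lane directly.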
7859 unsigned IdxDupOp =
7860 (Pattern == AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1
7861 : 2;
7862 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
7863 &AArch64::FPR128RegClass, MRI);
7864 break;
7865 }
7866 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
7867 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2: {
7868 unsigned IdxDupOp =
7869 (Pattern == AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1
7870 : 2;
7871 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
7872 &AArch64::FPR128RegClass, MRI);
7873 break;
7874 }
7875 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
7876 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2: {
7877 unsigned IdxDupOp =
7878 (Pattern == AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1
7879 : 2;
7880 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
7881 &AArch64::FPR128_loRegClass, MRI);
7882 break;
7883 }
7884 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
7885 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2: {
7886 unsigned IdxDupOp =
7887 (Pattern == AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1
7888 : 2;
7889 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
7890 &AArch64::FPR128RegClass, MRI);
7891 break;
7892 }
7893 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
7894 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2: {
7895 unsigned IdxDupOp =
7896 (Pattern == AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1
7897 : 2;
7898 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
7899 &AArch64::FPR128_loRegClass, MRI);
7900 break;
7901 }
7902 case AArch64MachineCombinerPattern::FNMADD: {
7903 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
7904 break;
7905 }
7906
7907 } // end switch (Pattern)
7908 // Record MUL and ADD/SUB for deletion
7909 if (MUL)
7910 DelInstrs.push_back(MUL);
7911 DelInstrs.push_back(&Root);
7912
7913 // Set the flags on the inserted instructions to be the merged flags of the
7914 // instructions that we have combined.
7915 uint32_t Flags = Root.getFlags();
7916 if (MUL)
7917 Flags = Root.mergeFlagsWith(*MUL);
7918 for (auto *MI : InsInstrs)
7919 MI->setFlags(Flags);
7920 }
7921
7922 /// Replace csincr-branch sequence by simple conditional branch
7923 ///
7924 /// Examples:
7925 /// 1. \code
7926 /// csinc w9, wzr, wzr, <condition code>
7927 /// tbnz w9, #0, 0x44
7928 /// \endcode
7929 /// to
7930 /// \code
7931 /// b.<inverted condition code>
7932 /// \endcode
7933 ///
7934 /// 2. \code
7935 /// csinc w9, wzr, wzr, <condition code>
7936 /// tbz w9, #0, 0x44
7937 /// \endcode
7938 /// to
7939 /// \code
7940 /// b.<condition code>
7941 /// \endcode
7942 ///
7943 /// Replace compare and branch sequence by TBZ/TBNZ instruction when the
7944 /// compare's constant operand is power of 2.
7945 ///
7946 /// Examples:
7947 /// \code
7948 /// and w8, w8, #0x400
7949 /// cbnz w8, L1
7950 /// \endcode
7951 /// to
7952 /// \code
7953 /// tbnz w8, #10, L1
7954 /// \endcode
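/// When the AND input is a 64-bit register but the tested bit is below 32,
/// the W-form TB[N]Z is emitted on the 32-bit sub-register, e.g.
/// (illustrative):
/// \code
///   and x8, x9, #0x10
///   cbnz x8, L1
/// \endcode
/// to
/// \code
///   tbnz w9, #4, L1
/// \endcode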
7955 ///
7956 /// \param MI Conditional Branch
7957 /// \return True when the simple conditional branch is generated
7958 ///
7959 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
7960 bool IsNegativeBranch = false;
7961 bool IsTestAndBranch = false;
7962 unsigned TargetBBInMI = 0;
7963 switch (MI.getOpcode()) {
7964 default:
7965 llvm_unreachable("Unknown branch instruction?");
7966 case AArch64::Bcc:
7967 return false;
7968 case AArch64::CBZW:
7969 case AArch64::CBZX:
7970 TargetBBInMI = 1;
7971 break;
7972 case AArch64::CBNZW:
7973 case AArch64::CBNZX:
7974 TargetBBInMI = 1;
7975 IsNegativeBranch = true;
7976 break;
7977 case AArch64::TBZW:
7978 case AArch64::TBZX:
7979 TargetBBInMI = 2;
7980 IsTestAndBranch = true;
7981 break;
7982 case AArch64::TBNZW:
7983 case AArch64::TBNZX:
7984 TargetBBInMI = 2;
7985 IsNegativeBranch = true;
7986 IsTestAndBranch = true;
7987 break;
7988 }
7989 // So we increment a zero register and test for bits other
7990 // than bit 0? Conservatively bail out in case the verifier
7991 // missed this case.
7992 if (IsTestAndBranch && MI.getOperand(1).getImm())
7993 return false;
7994
7995 // Find Definition.
7996 assert(MI.getParent() && "Incomplete machine instruction\n");
7997 MachineBasicBlock *MBB = MI.getParent();
7998 MachineFunction *MF = MBB->getParent();
7999 MachineRegisterInfo *MRI = &MF->getRegInfo();
8000 Register VReg = MI.getOperand(0).getReg();
8001 if (!VReg.isVirtual())
8002 return false;
8003
8004 MachineInstr *DefMI = MRI->getVRegDef(VReg);
8005
8006 // Look through COPY instructions to find definition.
8007 while (DefMI->isCopy()) {
8008 Register CopyVReg = DefMI->getOperand(1).getReg();
8009 if (!MRI->hasOneNonDBGUse(CopyVReg))
8010 return false;
8011 if (!MRI->hasOneDef(CopyVReg))
8012 return false;
8013 DefMI = MRI->getVRegDef(CopyVReg);
8014 }
8015
8016 switch (DefMI->getOpcode()) {
8017 default:
8018 return false;
8019 // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
8020 case AArch64::ANDWri:
8021 case AArch64::ANDXri: {
8022 if (IsTestAndBranch)
8023 return false;
8024 if (DefMI->getParent() != MBB)
8025 return false;
8026 if (!MRI->hasOneNonDBGUse(VReg))
8027 return false;
8028
8029 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
8030 uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
8031 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
8032 if (!isPowerOf2_64(Mask))
8033 return false;
8034
8035 MachineOperand &MO = DefMI->getOperand(1);
8036 Register NewReg = MO.getReg();
8037 if (!NewReg.isVirtual())
8038 return false;
8039
8040 assert(!MRI->def_empty(NewReg) && "Register must be defined.");
8041
8042 MachineBasicBlock &RefToMBB = *MBB;
8043 MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
8044 DebugLoc DL = MI.getDebugLoc();
8045 unsigned Imm = Log2_64(Mask);
8046 unsigned Opc = (Imm < 32)
8047 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
8048 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
8049 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
8050 .addReg(NewReg)
8051 .addImm(Imm)
8052 .addMBB(TBB);
8053 // Register lives on to the TB(N)Z now.
8054 MO.setIsKill(false);
8055
8056 // For immediates smaller than 32, we need to use the 32-bit
8057 // variant (W) in all cases, because the 64-bit variant cannot
8058 // encode them.
8059 // Therefore, if the input register is 64-bit, we need to take its
8060 // 32-bit sub-register.
8061 if (!Is32Bit && Imm < 32)
8062 NewMI->getOperand(0).setSubReg(AArch64::sub_32);
8063 MI.eraseFromParent();
8064 return true;
8065 }
8066 // Look for CSINC
8067 case AArch64::CSINCWr:
8068 case AArch64::CSINCXr: {
8069 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
8070 DefMI->getOperand(2).getReg() == AArch64::WZR) &&
8071 !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
8072 DefMI->getOperand(2).getReg() == AArch64::XZR))
8073 return false;
8074
8075 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
8076 true) != -1)
8077 return false;
8078
8079 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
8080 // Convert only when the condition code is not modified between
8081 // the CSINC and the branch. The CC may be used by other
8082 // instructions in between.
8083 if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
8084 return false;
8085 MachineBasicBlock &RefToMBB = *MBB;
8086 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
8087 DebugLoc DL = MI.getDebugLoc();
8088 if (IsNegativeBranch)
8089 CC = AArch64CC::getInvertedCondCode(CC);
8090 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
8091 MI.eraseFromParent();
8092 return true;
8093 }
8094 }
8095 }
8096
8097 std::pair<unsigned, unsigned>
8098 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
8099 const unsigned Mask = AArch64II::MO_FRAGMENT;
8100 return std::make_pair(TF & Mask, TF & ~Mask);
8101 }
8102
8103 ArrayRef<std::pair<unsigned, const char *>>
8104 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
8105 using namespace AArch64II;
8106
8107 static const std::pair<unsigned, const char *> TargetFlags[] = {
8108 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
8109 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
8110 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
8111 {MO_HI12, "aarch64-hi12"}};
8112 return ArrayRef(TargetFlags);
8113 }
8114
8115 ArrayRef<std::pair<unsigned, const char *>>
8116 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
8117 using namespace AArch64II;
8118
8119 static const std::pair<unsigned, const char *> TargetFlags[] = {
8120 {MO_COFFSTUB, "aarch64-coffstub"},
8121 {MO_GOT, "aarch64-got"},
8122 {MO_NC, "aarch64-nc"},
8123 {MO_S, "aarch64-s"},
8124 {MO_TLS, "aarch64-tls"},
8125 {MO_DLLIMPORT, "aarch64-dllimport"},
8126 {MO_PREL, "aarch64-prel"},
8127 {MO_TAGGED, "aarch64-tagged"},
8128 {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
8129 };
8130 return ArrayRef(TargetFlags);
8131 }
8132
8133 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
8134 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
8135 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8136 {{MOSuppressPair, "aarch64-suppress-pair"},
8137 {MOStridedAccess, "aarch64-strided-access"}};
8138 return ArrayRef(TargetFlags);
8139 }
8140
8141 /// Constants defining how certain sequences should be outlined.
8142 /// This encompasses how an outlined function should be called, and what kind of
8143 /// frame should be emitted for that outlined function.
8144 ///
8145 /// \p MachineOutlinerDefault implies that the function should be called with
8146 /// a save and restore of LR to the stack.
8147 ///
8148 /// That is,
8149 ///
8150 /// I1 Save LR OUTLINED_FUNCTION:
8151 /// I2 --> BL OUTLINED_FUNCTION I1
8152 /// I3 Restore LR I2
8153 /// I3
8154 /// RET
8155 ///
8156 /// * Call construction overhead: 3 (save + BL + restore)
8157 /// * Frame construction overhead: 1 (ret)
8158 /// * Requires stack fixups? Yes
8159 ///
8160 /// \p MachineOutlinerTailCall implies that the function is being created from
8161 /// a sequence of instructions ending in a return.
8162 ///
8163 /// That is,
8164 ///
8165 /// I1 OUTLINED_FUNCTION:
8166 /// I2 --> B OUTLINED_FUNCTION I1
8167 /// RET I2
8168 /// RET
8169 ///
8170 /// * Call construction overhead: 1 (B)
8171 /// * Frame construction overhead: 0 (Return included in sequence)
8172 /// * Requires stack fixups? No
8173 ///
8174 /// \p MachineOutlinerNoLRSave implies that the function should be called using
8175 /// a BL instruction, but doesn't require LR to be saved and restored. This
8176 /// happens when LR is known to be dead.
8177 ///
8178 /// That is,
8179 ///
8180 /// I1 OUTLINED_FUNCTION:
8181 /// I2 --> BL OUTLINED_FUNCTION I1
8182 /// I3 I2
8183 /// I3
8184 /// RET
8185 ///
8186 /// * Call construction overhead: 1 (BL)
8187 /// * Frame construction overhead: 1 (RET)
8188 /// * Requires stack fixups? No
8189 ///
8190 /// \p MachineOutlinerThunk implies that the function is being created from
8191 /// a sequence of instructions ending in a call. The outlined function is
8192 /// called with a BL instruction, and the outlined function tail-calls the
8193 /// original call destination.
8194 ///
8195 /// That is,
8196 ///
8197 /// I1 OUTLINED_FUNCTION:
8198 /// I2 --> BL OUTLINED_FUNCTION I1
8199 /// BL f I2
8200 /// B f
8201 /// * Call construction overhead: 1 (BL)
8202 /// * Frame construction overhead: 0
8203 /// * Requires stack fixups? No
8204 ///
8205 /// \p MachineOutlinerRegSave implies that the function should be called with a
8206 /// save and restore of LR to an available register. This allows us to avoid
8207 /// stack fixups. Note that this outlining variant is compatible with the
8208 /// NoLRSave case.
8209 ///
8210 /// That is,
8211 ///
8212 /// I1 Save LR OUTLINED_FUNCTION:
8213 /// I2 --> BL OUTLINED_FUNCTION I1
8214 /// I3 Restore LR I2
8215 /// I3
8216 /// RET
8217 ///
8218 /// * Call construction overhead: 3 (save + BL + restore)
8219 /// * Frame construction overhead: 1 (ret)
8220 /// * Requires stack fixups? No
8221 enum MachineOutlinerClass {
8222 MachineOutlinerDefault, /// Emit a save, restore, call, and return.
8223 MachineOutlinerTailCall, /// Only emit a branch.
8224 MachineOutlinerNoLRSave, /// Emit a call and return.
8225 MachineOutlinerThunk, /// Emit a call and tail-call.
8226 MachineOutlinerRegSave /// Same as default, but save to a register.
8227 };
8228
8229 enum MachineOutlinerMBBFlags {
8230 LRUnavailableSomewhere = 0x2,
8231 HasCalls = 0x4,
8232 UnsafeRegsDead = 0x8
8233 };
8234
8235 Register
8236 AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
8237 MachineFunction *MF = C.getMF();
8238 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
8239 const AArch64RegisterInfo *ARI =
8240 static_cast<const AArch64RegisterInfo *>(&TRI);
8241 // Check if there is an available register across the sequence that we can
8242 // use.
8243 for (unsigned Reg : AArch64::GPR64RegClass) {
8244 if (!ARI->isReservedReg(*MF, Reg) &&
8245 Reg != AArch64::LR && // LR is not reserved, but don't use it.
8246 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
8247 Reg != AArch64::X17 && // Ditto for X17.
8248 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
8249 C.isAvailableInsideSeq(Reg, TRI))
8250 return Reg;
8251 }
8252 return Register();
8253 }
8254
8255 static bool
8256 outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
8257 const outliner::Candidate &b) {
8258 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
8259 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
8260
8261 return MFIa->shouldSignReturnAddress(false) == MFIb->shouldSignReturnAddress(false) &&
8262 MFIa->shouldSignReturnAddress(true) == MFIb->shouldSignReturnAddress(true);
8263 }
8264
8265 static bool
8266 outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
8267 const outliner::Candidate &b) {
8268 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
8269 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
8270
8271 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
8272 }
8273
8274 static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
8275 const outliner::Candidate &b) {
8276 const AArch64Subtarget &SubtargetA =
8277 a.getMF()->getSubtarget<AArch64Subtarget>();
8278 const AArch64Subtarget &SubtargetB =
8279 b.getMF()->getSubtarget<AArch64Subtarget>();
8280 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
8281 }
8282
8283 std::optional<outliner::OutlinedFunction>
8284 AArch64InstrInfo::getOutliningCandidateInfo(
8285 std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
8286 unsigned SequenceSize = 0;
8287 for (auto &MI : RepeatedSequenceLocs[0])
8288 SequenceSize += getInstSizeInBytes(MI);
8289
8290 unsigned NumBytesToCreateFrame = 0;
8291
8292 // We only allow outlining for functions having exactly matching return
8293 // address signing attributes, i.e., all share the same value for the
8294 // attribute "sign-return-address" and all share the same type of key they
8295 // are signed with.
8296 // Additionally we require all functions to simultaneously either support
8297 // v8.3a features or not. Otherwise an outlined function could get signed
8298 // using dedicated v8.3 instructions and a call from a function that doesn't
8299 // support v8.3 instructions would therefore be invalid.
8300 if (std::adjacent_find(
8301 RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
8302 [](const outliner::Candidate &a, const outliner::Candidate &b) {
8303 // Return true if a and b are non-equal w.r.t. return address
8304 // signing or support of v8.3a features
8305 if (outliningCandidatesSigningScopeConsensus(a, b) &&
8306 outliningCandidatesSigningKeyConsensus(a, b) &&
8307 outliningCandidatesV8_3OpsConsensus(a, b)) {
8308 return false;
8309 }
8310 return true;
8311 }) != RepeatedSequenceLocs.end()) {
8312 return std::nullopt;
8313 }
8314
8315 // Since at this point all candidates agree on their return address signing
8316 // picking just one is fine. If the candidate functions potentially sign their
8317 // return addresses, the outlined function should do the same. Note that in
8318 // the case of "sign-return-address"="non-leaf" this is an assumption: It is
8319 // not certainly true that the outlined function will have to sign its return
8320 // address but this decision is made later, when the decision to outline
8321 // has already been made.
8322 // The same holds for the number of additional instructions we need: On
8323 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
8324 // necessary. However, at this point we don't know if the outlined function
8325 // will have a RET instruction so we assume the worst.
8326 const TargetRegisterInfo &TRI = getRegisterInfo();
8327 // Performing a tail call may require extra checks when PAuth is enabled.
8328 // If PAuth is disabled, set it to zero for uniformity.
8329 unsigned NumBytesToCheckLRInTCEpilogue = 0;
8330 if (RepeatedSequenceLocs[0]
8331 .getMF()
8332 ->getInfo<AArch64FunctionInfo>()
8333 ->shouldSignReturnAddress(true)) {
8334 // One PAC and one AUT instructions
8335 NumBytesToCreateFrame += 8;
8336
8337 // PAuth is enabled - set extra tail call cost, if any.
8338 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(
8339 *RepeatedSequenceLocs[0].getMF());
8340 NumBytesToCheckLRInTCEpilogue =
8341 AArch64PAuth::getCheckerSizeInBytes(LRCheckMethod);
8342 // Checking the authenticated LR value may significantly impact
8343 // SequenceSize, so account for it for more precise results.
8344 if (isTailCallReturnInst(RepeatedSequenceLocs[0].back()))
8345 SequenceSize += NumBytesToCheckLRInTCEpilogue;
8346
8347 // We have to check if sp modifying instructions would get outlined.
8348 // If so, we only allow outlining if SP is unchanged overall, so matching
8349 // sub and add instructions are okay to outline; all other SP modifications
8350 // are not.
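// For example (illustrative), a candidate containing the matched pair
//   sub sp, sp, #16
//   ...
//   add sp, sp, #16
// leaves SP unchanged overall and may still be outlined, whereas an
// unmatched 'add sp, sp, #16' causes the candidate to be dropped below.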
8351 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
8352 int SPValue = 0;
8353 for (auto &MI : C) {
8354 if (MI.modifiesRegister(AArch64::SP, &TRI)) {
8355 switch (MI.getOpcode()) {
8356 case AArch64::ADDXri:
8357 case AArch64::ADDWri:
8358 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
8359 assert(MI.getOperand(2).isImm() &&
8360 "Expected operand to be immediate");
8361 assert(MI.getOperand(1).isReg() &&
8362 "Expected operand to be a register");
8363 // Check if the add just increments sp. If so, we search for
8364 // matching sub instructions that decrement sp. If not, the
8365 // modification is illegal
8366 if (MI.getOperand(1).getReg() == AArch64::SP)
8367 SPValue += MI.getOperand(2).getImm();
8368 else
8369 return true;
8370 break;
8371 case AArch64::SUBXri:
8372 case AArch64::SUBWri:
8373 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
8374 assert(MI.getOperand(2).isImm() &&
8375 "Expected operand to be immediate");
8376 assert(MI.getOperand(1).isReg() &&
8377 "Expected operand to be a register");
8378 // Check if the sub just decrements sp. If so, we search for
8379 // matching add instructions that increment sp. If not, the
8380 // modification is illegal
8381 if (MI.getOperand(1).getReg() == AArch64::SP)
8382 SPValue -= MI.getOperand(2).getImm();
8383 else
8384 return true;
8385 break;
8386 default:
8387 return true;
8388 }
8389 }
8390 }
8391 if (SPValue)
8392 return true;
8393 return false;
8394 };
8395 // Remove candidates with illegal stack modifying instructions
8396 llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
8397
8398 // If the sequence doesn't have enough candidates left, then we're done.
8399 if (RepeatedSequenceLocs.size() < 2)
8400 return std::nullopt;
8401 }
8402
8403 // Properties about candidate MBBs that hold for all of them.
8404 unsigned FlagsSetInAll = 0xF;
8405
8406 // Compute liveness information for each candidate, and set FlagsSetInAll.
8407 for (outliner::Candidate &C : RepeatedSequenceLocs)
8408 FlagsSetInAll &= C.Flags;
8409
8410 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
8411
8412 // Helper lambda which sets call information for every candidate.
8413 auto SetCandidateCallInfo =
8414 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
8415 for (outliner::Candidate &C : RepeatedSequenceLocs)
8416 C.setCallInfo(CallID, NumBytesForCall);
8417 };
8418
8419 unsigned FrameID = MachineOutlinerDefault;
8420 NumBytesToCreateFrame += 4;
8421
8422 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
8423 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
8424 });
8425
8426 // We check to see if CFI Instructions are present, and if they are
8427 // we find the number of CFI Instructions in the candidates.
8428 unsigned CFICount = 0;
8429 for (auto &I : RepeatedSequenceLocs[0]) {
8430 if (I.isCFIInstruction())
8431 CFICount++;
8432 }
8433
8434 // We compare the number of found CFI Instructions to the number of CFI
8435 // instructions in the parent function for each candidate. We must check this
8436 // since if we outline one of the CFI instructions in a function, we have to
8437 // outline them all for correctness. If we do not, the address offsets will be
8438 // incorrect between the two sections of the program.
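// For example (illustrative): if a candidate's parent function emits four
// CFI instructions but the repeated sequence only covers two of them,
// outlining would separate the two halves and the recorded unwind offsets
// would no longer match the emitted code, so such candidates are rejected.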
8439 for (outliner::Candidate &C : RepeatedSequenceLocs) {
8440 std::vector<MCCFIInstruction> CFIInstructions =
8441 C.getMF()->getFrameInstructions();
8442
8443 if (CFICount > 0 && CFICount != CFIInstructions.size())
8444 return std::nullopt;
8445 }
8446
8447 // Returns true if an instruction is safe to fix up, false otherwise.
8448 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
8449 if (MI.isCall())
8450 return true;
8451
8452 if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
8453 !MI.readsRegister(AArch64::SP, &TRI))
8454 return true;
8455
8456 // Any modification of SP will break our code to save/restore LR.
8457 // FIXME: We could handle some instructions which add a constant
8458 // offset to SP, with a bit more work.
8459 if (MI.modifiesRegister(AArch64::SP, &TRI))
8460 return false;
8461
8462 // At this point, we have a stack instruction that we might need to
8463 // fix up. We'll handle it if it's a load or store.
8464 if (MI.mayLoadOrStore()) {
8465 const MachineOperand *Base; // Filled with the base operand of MI.
8466 int64_t Offset; // Filled with the offset of MI.
8467 bool OffsetIsScalable;
8468
8469 // Does it allow us to offset the base operand and is the base the
8470 // register SP?
8471 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
8472 !Base->isReg() || Base->getReg() != AArch64::SP)
8473 return false;
8474
8475 // The fix-up code below assumes byte offsets.
8476 if (OffsetIsScalable)
8477 return false;
8478
8479 // Find the minimum/maximum offset for this instruction and check
8480 // if fixing it up would be in range.
8481 int64_t MinOffset,
8482 MaxOffset; // Unscaled offsets for the instruction.
8483 // The scale to multiply the offsets by.
8484 TypeSize Scale(0U, false), DummyWidth(0U, false);
8485 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
8486
8487 Offset += 16; // Update the offset to what it would be if we outlined.
8488 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
8489 Offset > MaxOffset * (int64_t)Scale.getFixedValue())
8490 return false;
8491
8492 // It's in range, so we can outline it.
8493 return true;
8494 }
8495
8496 // FIXME: Add handling for instructions like "add x0, sp, #8".
8497
8498 // We can't fix it up, so don't outline it.
8499 return false;
8500 };
8501
8502 // True if it's possible to fix up each stack instruction in this sequence.
8503 // Important for frames/call variants that modify the stack.
8504 bool AllStackInstrsSafe =
8505 llvm::all_of(RepeatedSequenceLocs[0], IsSafeToFixup);
8506
8507 // If the last instruction in any candidate is a terminator, then we should
8508 // tail call all of the candidates.
8509 if (RepeatedSequenceLocs[0].back().isTerminator()) {
8510 FrameID = MachineOutlinerTailCall;
8511 NumBytesToCreateFrame = 0;
8512 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
8513 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
8514 }
8515
8516 else if (LastInstrOpcode == AArch64::BL ||
8517 ((LastInstrOpcode == AArch64::BLR ||
8518 LastInstrOpcode == AArch64::BLRNoIP) &&
8519 !HasBTI)) {
8520 // FIXME: Do we need to check if the code after this uses the value of LR?
8521 FrameID = MachineOutlinerThunk;
8522 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
8523 SetCandidateCallInfo(MachineOutlinerThunk, 4);
8524 }
8525
8526 else {
8527 // We need to decide how to emit calls + frames. We can always emit the same
8528 // frame if we don't need to save to the stack. If we have to save to the
8529 // stack, then we need a different frame.
8530 unsigned NumBytesNoStackCalls = 0;
8531 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
8532
8533 // Check if we have to save LR.
8534 for (outliner::Candidate &C : RepeatedSequenceLocs) {
8535 bool LRAvailable =
8536 (C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere)
8537 ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)
8538 : true;
8539 // If we have a noreturn caller, then we're going to be conservative and
8540 // say that we have to save LR. If we don't have a ret at the end of the
8541 // block, then we can't reason about liveness accurately.
8542 //
8543 // FIXME: We can probably do better than always disabling this in
8544 // noreturn functions by fixing up the liveness info.
8545 bool IsNoReturn =
8546 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
8547
8548 // Is LR available? If so, we don't need a save.
8549 if (LRAvailable && !IsNoReturn) {
8550 NumBytesNoStackCalls += 4;
8551 C.setCallInfo(MachineOutlinerNoLRSave, 4);
8552 CandidatesWithoutStackFixups.push_back(C);
8553 }
8554
8555 // Is an unused register available? If so, we won't modify the stack, so
8556 // we can outline with the same frame type as those that don't save LR.
8557 else if (findRegisterToSaveLRTo(C)) {
8558 NumBytesNoStackCalls += 12;
8559 C.setCallInfo(MachineOutlinerRegSave, 12);
8560 CandidatesWithoutStackFixups.push_back(C);
8561 }
8562
8563 // Is SP used in the sequence at all? If not, we don't have to modify
8564 // the stack, so we are guaranteed to get the same frame.
8565 else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
8566 NumBytesNoStackCalls += 12;
8567 C.setCallInfo(MachineOutlinerDefault, 12);
8568 CandidatesWithoutStackFixups.push_back(C);
8569 }
8570
8571 // If we outline this, we need to modify the stack. Pretend we don't
8572 // outline this by saving all of its bytes.
8573 else {
8574 NumBytesNoStackCalls += SequenceSize;
8575 }
8576 }
8577
8578 // If there are no places where we have to save LR, then note that we
8579 // don't have to update the stack. Otherwise, give every candidate the
8580 // default call type, as long as it's safe to do so.
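// For example (illustrative): with three candidates where two can use
// MachineOutlinerNoLRSave (4 bytes each) and one needs MachineOutlinerRegSave
// (12 bytes), NumBytesNoStackCalls is 4 + 4 + 12 = 20 <= 3 * 12 = 36, so we
// keep the fixup-free variants instead of the default save-to-stack calls.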
8581 if (!AllStackInstrsSafe ||
8582 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
8583 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
8584 FrameID = MachineOutlinerNoLRSave;
8585 if (RepeatedSequenceLocs.size() < 2)
8586 return std::nullopt;
8587 } else {
8588 SetCandidateCallInfo(MachineOutlinerDefault, 12);
8589
8590 // Bugzilla ID: 46767
8591 // TODO: Check if fixing up the stack more than once is safe so we can
8592 // outline these.
8593 //
8594 // An outline resulting in a caller that requires stack fixups at the
8595 // callsite to a callee that also requires stack fixups can happen when
8596 // there are no available registers at the candidate callsite for a
8597 // candidate that itself also has calls.
8598 //
8599 // In other words if function_containing_sequence in the following pseudo
8600 // assembly requires that we save LR at the point of the call, but there
8601 // are no available registers: in this case we save using SP and as a
8602 // result the SP offsets requires stack fixups by multiples of 16.
8603 //
8604 // function_containing_sequence:
8605 // ...
8606 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
8607 // call OUTLINED_FUNCTION_N
8608 // restore LR from SP
8609 // ...
8610 //
8611 // OUTLINED_FUNCTION_N:
8612 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
8613 // ...
8614 // bl foo
8615 // restore LR from SP
8616 // ret
8617 //
8618 // Because the code to handle more than one stack fixup does not
8619 // currently have the proper checks for legality, these cases will assert
8620 // in the AArch64 MachineOutliner. This is because the code to do this
8621 // needs more hardening, testing, better checks that generated code is
8622 // legal, etc and because it is only verified to handle a single pass of
8623 // stack fixup.
8624 //
8625 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
8626 // these cases until they are known to be handled. Bugzilla 46767 is
8627 // referenced in comments at the assert site.
8628 //
8629 // To avoid asserting (or generating non-legal code on noassert builds)
8630 // we remove all candidates which would need more than one stack fixup by
8631 // pruning the cases where the candidate has calls while also having no
8632 // available LR and having no available general purpose registers to copy
8633 // LR to (ie one extra stack save/restore).
8634 //
8635 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
8636 erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
8637 auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
8638 return (llvm::any_of(C, IsCall)) &&
8639 (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
8640 !findRegisterToSaveLRTo(C));
8641 });
8642 }
8643 }
8644
8645 // If we dropped all of the candidates, bail out here.
8646 if (RepeatedSequenceLocs.size() < 2) {
8647 RepeatedSequenceLocs.clear();
8648 return std::nullopt;
8649 }
8650 }
8651
8652 // Does every candidate's MBB contain a call? If so, then we might have a call
8653 // in the range.
8654 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
8655 // Check if the range contains a call. These require a save + restore of the
8656 // link register.
8657 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
8658 bool ModStackToSaveLR = false;
8659 if (std::any_of(FirstCand.begin(), std::prev(FirstCand.end()),
8660 [](const MachineInstr &MI) { return MI.isCall(); }))
8661 ModStackToSaveLR = true;
8662
8663 // Handle the last instruction separately. If this is a tail call, then the
8664 // last instruction is a call. We don't want to save + restore in this case.
8665 // However, it could be possible that the last instruction is a call without
8666 // it being valid to tail call this sequence. We should consider this as
8667 // well.
8668 else if (FrameID != MachineOutlinerThunk &&
8669 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
8670 ModStackToSaveLR = true;
8671
8672 if (ModStackToSaveLR) {
8673 // We can't fix up the stack. Bail out.
8674 if (!AllStackInstrsSafe) {
8675 RepeatedSequenceLocs.clear();
8676 return std::nullopt;
8677 }
8678
8679 // Save + restore LR.
8680 NumBytesToCreateFrame += 8;
8681 }
8682 }
8683
8684 // If we have CFI instructions, we can only outline if the outlined section
8685 // can be a tail call
8686 if (FrameID != MachineOutlinerTailCall && CFICount > 0)
8687 return std::nullopt;
8688
8689 return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
8690 NumBytesToCreateFrame, FrameID);
8691 }
8692
8693 void AArch64InstrInfo::mergeOutliningCandidateAttributes(
8694 Function &F, std::vector<outliner::Candidate> &Candidates) const {
8695 // If a bunch of candidates reach this point they must agree on their return
8696 // address signing. It is therefore enough to just consider the signing
8697 // behaviour of one of them
8698 const auto &CFn = Candidates.front().getMF()->getFunction();
8699
8700 if (CFn.hasFnAttribute("ptrauth-returns"))
8701 F.addFnAttr(CFn.getFnAttribute("ptrauth-returns"));
8702 if (CFn.hasFnAttribute("ptrauth-auth-traps"))
8703 F.addFnAttr(CFn.getFnAttribute("ptrauth-auth-traps"));
8704 // Since all candidates belong to the same module, just copy the
8705 // function-level attributes of an arbitrary function.
8706 if (CFn.hasFnAttribute("sign-return-address"))
8707 F.addFnAttr(CFn.getFnAttribute("sign-return-address"));
8708 if (CFn.hasFnAttribute("sign-return-address-key"))
8709 F.addFnAttr(CFn.getFnAttribute("sign-return-address-key"));
8710
8711 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
8712 }
8713
8714 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
8715 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
8716 const Function &F = MF.getFunction();
8717
8718 // Can F be deduplicated by the linker? If it can, don't outline from it.
8719 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
8720 return false;
8721
8722 // Don't outline from functions with section markings; the program could
8723 // expect that all the code is in the named section.
8724 // FIXME: Allow outlining from multiple functions with the same section
8725 // marking.
8726 if (F.hasSection())
8727 return false;
8728
8729 // Outlining from functions with redzones is unsafe since the outliner may
8730 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
8731 // outline from it.
8732 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
8733 if (!AFI || AFI->hasRedZone().value_or(true))
8734 return false;
8735
8736 // FIXME: Determine whether it is safe to outline from functions which contain
8737 // streaming-mode changes. We may need to ensure any smstart/smstop pairs are
8738 // outlined together and ensure it is safe to outline with async unwind info,
8739 // required for saving & restoring VG around calls.
8740 if (AFI->hasStreamingModeChanges())
8741 return false;
8742
8743 // FIXME: Teach the outliner to generate/handle Windows unwind info.
8744 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
8745 return false;
8746
8747 // It's safe to outline from MF.
8748 return true;
8749 }
8750
8751 SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
8752 AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
8753 unsigned &Flags) const {
8754 assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
8755 "Must track liveness!");
8756 SmallVector<
8757 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
8758 Ranges;
8759 // According to the AArch64 Procedure Call Standard, the following are
8760 // undefined on entry/exit from a function call:
8761 //
8762 // * Registers x16, x17, (and thus w16, w17)
8763 // * Condition codes (and thus the NZCV register)
8764 //
8765 // If any of these registers are used inside or live across an outlined
8766 // function, then they may be modified later, either by the compiler or
8767 // some other tool (like the linker).
8768 //
8769 // To avoid outlining in these situations, partition each block into ranges
8770 // where these registers are dead. We will only outline from those ranges.
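// For example (illustrative): in a block where x16 is defined and then used
// a few instructions later, the instructions over which x16 is live are
// skipped, and the instructions before the def and after the last use form
// separate outlinable ranges.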
8771 LiveRegUnits LRU(getRegisterInfo());
8772 auto AreAllUnsafeRegsDead = [&LRU]() {
8773 return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&
8774 LRU.available(AArch64::NZCV);
8775 };
8776
8777 // We need to know if LR is live across an outlining boundary later on in
8778 // order to decide how we'll create the outlined call, frame, etc.
8779 //
8780 // It's pretty expensive to check this for *every candidate* within a block.
8781 // That's some potentially n^2 behaviour, since in the worst case, we'd need
8782 // to compute liveness from the end of the block for O(n) candidates within
8783 // the block.
8784 //
8785 // So, to improve the average case, let's keep track of liveness from the end
8786 // of the block to the beginning of *every outlinable range*. If we know that
8787 // LR is available in every range we could outline from, then we know that
8788 // we don't need to check liveness for any candidate within that range.
8789 bool LRAvailableEverywhere = true;
8790 // Compute liveness bottom-up.
8791 LRU.addLiveOuts(MBB);
8792 // Update flags that require info about the entire MBB.
8793 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
8794 if (MI.isCall() && !MI.isTerminator())
8795 Flags |= MachineOutlinerMBBFlags::HasCalls;
8796 };
8797 // Range: [RangeBegin, RangeEnd)
8798 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
8799 unsigned RangeLen;
8800 auto CreateNewRangeStartingAt =
8801 [&RangeBegin, &RangeEnd,
8802 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
8803 RangeBegin = NewBegin;
8804 RangeEnd = std::next(RangeBegin);
8805 RangeLen = 0;
8806 };
8807 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
8808 // At least one unsafe register is not dead. We do not want to outline at
8809 // this point. If it is long enough to outline from, save the range
8810 // [RangeBegin, RangeEnd).
8811 if (RangeLen > 1)
8812 Ranges.push_back(std::make_pair(RangeBegin, RangeEnd));
8813 };
8814 // Find the first point where all unsafe registers are dead.
8815 // FIND: <safe instr> <-- end of first potential range
8816 // SKIP: <unsafe def>
8817 // SKIP: ... everything between ...
8818 // SKIP: <unsafe use>
8819 auto FirstPossibleEndPt = MBB.instr_rbegin();
8820 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
8821 LRU.stepBackward(*FirstPossibleEndPt);
8822 // Update flags that impact how we outline across the entire block,
8823 // regardless of safety.
8824 UpdateWholeMBBFlags(*FirstPossibleEndPt);
8825 if (AreAllUnsafeRegsDead())
8826 break;
8827 }
8828 // If we exhausted the entire block, we have no safe ranges to outline.
8829 if (FirstPossibleEndPt == MBB.instr_rend())
8830 return Ranges;
8831 // Current range.
8832 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
8833 // FirstPossibleEndPt points to the first place (walking backwards) where all
8834 // unsafe registers are dead (if there is any such point). Begin partitioning
8835 // the MBB into ranges.
8836 for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) {
8837 LRU.stepBackward(MI);
8838 UpdateWholeMBBFlags(MI);
8839 if (!AreAllUnsafeRegsDead()) {
8840 SaveRangeIfNonEmpty();
8841 CreateNewRangeStartingAt(MI.getIterator());
8842 continue;
8843 }
8844 LRAvailableEverywhere &= LRU.available(AArch64::LR);
8845 RangeBegin = MI.getIterator();
8846 ++RangeLen;
8847 }
8848 // Above loop misses the last (or only) range. If we are still safe, then
8849 // let's save the range.
8850 if (AreAllUnsafeRegsDead())
8851 SaveRangeIfNonEmpty();
8852 if (Ranges.empty())
8853 return Ranges;
8854 // We found the ranges bottom-up. Mapping expects the top-down. Reverse
8855 // the order.
8856 std::reverse(Ranges.begin(), Ranges.end());
8857 // If there is at least one outlinable range where LR is unavailable
8858 // somewhere, remember that.
8859 if (!LRAvailableEverywhere)
8860 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
8861 return Ranges;
8862 }
8863
8864 outliner::InstrType
8865 AArch64InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT,
8866 unsigned Flags) const {
8867 MachineInstr &MI = *MIT;
8868 MachineBasicBlock *MBB = MI.getParent();
8869 MachineFunction *MF = MBB->getParent();
8870 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
8871
8872 // Don't outline anything used for return address signing. The outlined
8873 // function will get signed later if needed
8874 switch (MI.getOpcode()) {
8875 case AArch64::PACM:
8876 case AArch64::PACIASP:
8877 case AArch64::PACIBSP:
8878 case AArch64::PACIASPPC:
8879 case AArch64::PACIBSPPC:
8880 case AArch64::AUTIASP:
8881 case AArch64::AUTIBSP:
8882 case AArch64::AUTIASPPCi:
8883 case AArch64::AUTIASPPCr:
8884 case AArch64::AUTIBSPPCi:
8885 case AArch64::AUTIBSPPCr:
8886 case AArch64::RETAA:
8887 case AArch64::RETAB:
8888 case AArch64::RETAASPPCi:
8889 case AArch64::RETAASPPCr:
8890 case AArch64::RETABSPPCi:
8891 case AArch64::RETABSPPCr:
8892 case AArch64::EMITBKEY:
8893 case AArch64::PAUTH_PROLOGUE:
8894 case AArch64::PAUTH_EPILOGUE:
8895 return outliner::InstrType::Illegal;
8896 }
8897
8898 // Don't outline LOHs.
8899 if (FuncInfo->getLOHRelated().count(&MI))
8900 return outliner::InstrType::Illegal;
8901
8902 // We can only outline these if we will tail call the outlined function, or
8903 // fix up the CFI offsets. Currently, CFI instructions are outlined only if
8904 // they end up in a tail call.
8905 //
8906 // FIXME: If the proper fixups for the offset are implemented, this should be
8907 // possible.
8908 if (MI.isCFIInstruction())
8909 return outliner::InstrType::Legal;
8910
8911 // Is this a terminator for a basic block?
8912 if (MI.isTerminator())
8913 // TargetInstrInfo::getOutliningType has already filtered out anything
8914 // that would break this, so we can allow it here.
8915 return outliner::InstrType::Legal;
8916
8917 // Make sure none of the operands are un-outlinable.
8918 for (const MachineOperand &MOP : MI.operands()) {
8919 // A check preventing CFI indices was here before, but only CFI
8920 // instructions should have those.
8921 assert(!MOP.isCFIIndex());
8922
8923 // If it uses LR or W30 explicitly, then don't touch it.
8924 if (MOP.isReg() && !MOP.isImplicit() &&
8925 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
8926 return outliner::InstrType::Illegal;
8927 }
8928
8929 // Special cases for instructions that can always be outlined, but will fail
8930 // the later tests, e.g. ADRPs, which are PC-relative, use LR, but can always
8931 // be outlined because they don't require a *specific* value to be in LR.
8932 if (MI.getOpcode() == AArch64::ADRP)
8933 return outliner::InstrType::Legal;
8934
8935 // If MI is a call we might be able to outline it. We don't want to outline
8936 // any calls that rely on the position of items on the stack. When we outline
8937 // something containing a call, we have to emit a save and restore of LR in
8938 // the outlined function. Currently, this always happens by saving LR to the
8939 // stack. Thus, if we outline, say, half the parameters for a function call
8940 // plus the call, then we'll break the callee's expectations for the layout
8941 // of the stack.
8942 //
8943 // FIXME: Allow calls to functions which construct a stack frame, as long
8944 // as they don't access arguments on the stack.
8945 // FIXME: Figure out some way to analyze functions defined in other modules.
8946 // We should be able to compute the memory usage based on the IR calling
8947 // convention, even if we can't see the definition.
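// For example (illustrative): outlining the pair
//   str x0, [sp]   ; a stack-passed argument
//   bl  callee
// would put the outlined function's LR spill between the stored argument and
// the call, so callee would no longer find its argument at the expected
// SP-relative slot.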
8948 if (MI.isCall()) {
8949 // Get the function associated with the call. Look at each operand and find
8950 // the one that represents the callee and get its name.
8951 const Function *Callee = nullptr;
8952 for (const MachineOperand &MOP : MI.operands()) {
8953 if (MOP.isGlobal()) {
8954 Callee = dyn_cast<Function>(MOP.getGlobal());
8955 break;
8956 }
8957 }
8958
8959 // Never outline calls to mcount. There isn't any rule that would require
8960 // this, but the Linux kernel's "ftrace" feature depends on it.
8961 if (Callee && Callee->getName() == "\01_mcount")
8962 return outliner::InstrType::Illegal;
8963
8964 // If we don't know anything about the callee, assume it depends on the
8965 // stack layout of the caller. In that case, it's only legal to outline
8966 // as a tail-call. Explicitly list the call instructions we know about so we
8967 // don't get unexpected results with call pseudo-instructions.
8968 auto UnknownCallOutlineType = outliner::InstrType::Illegal;
8969 if (MI.getOpcode() == AArch64::BLR ||
8970 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
8971 UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
8972
8973 if (!Callee)
8974 return UnknownCallOutlineType;
8975
8976 // We have a function we have information about. Check if it's something we
8977 // can safely outline.
8978 MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
8979
8980 // We don't know what's going on with the callee at all. Don't touch it.
8981 if (!CalleeMF)
8982 return UnknownCallOutlineType;
8983
8984 // Check if we know anything about the callee saves on the function. If we
8985 // don't, then don't touch it, since that implies that we haven't
8986 // computed anything about its stack frame yet.
8987 MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
8988 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
8989 MFI.getNumObjects() > 0)
8990 return UnknownCallOutlineType;
8991
8992 // At this point, we can say that CalleeMF ought to not pass anything on the
8993 // stack. Therefore, we can outline it.
8994 return outliner::InstrType::Legal;
8995 }
8996
8997 // Don't touch the link register or W30.
8998 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
8999 MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
9000 return outliner::InstrType::Illegal;
9001
9002 // Don't outline BTI instructions, because that will prevent the outlining
9003 // site from being indirectly callable.
9004 if (hasBTISemantics(MI))
9005 return outliner::InstrType::Illegal;
9006
9007 return outliner::InstrType::Legal;
9008 }
9009
9010 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
9011 for (MachineInstr &MI : MBB) {
9012 const MachineOperand *Base;
9013 TypeSize Width(0, false);
9014 int64_t Offset;
9015 bool OffsetIsScalable;
9016
9017 // Is this a load or store with an immediate offset with SP as the base?
9018 if (!MI.mayLoadOrStore() ||
9019 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
9020 &RI) ||
9021 (Base->isReg() && Base->getReg() != AArch64::SP))
9022 continue;
9023
9024 // It is, so we have to fix it up.
9025 TypeSize Scale(0U, false);
9026 int64_t Dummy1, Dummy2;
9027
9028 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
9029 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
9030 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
9031 assert(Scale != 0 && "Unexpected opcode!");
9032 assert(!OffsetIsScalable && "Expected offset to be a byte offset");
9033
9034 // We've pushed the return address to the stack, so add 16 to the offset.
9035 // This is safe, since we already checked if it would overflow when we
9036 // checked if this instruction was legal to outline.
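// For example (illustrative): an 'ldr x0, [sp, #8]' in the outlined body
// (byte offset 8, Scale 8 for LDRXui) is rewritten to 'ldr x0, [sp, #24]',
// since NewImm = (8 + 16) / 8 = 3.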
9037 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
9038 StackOffsetOperand.setImm(NewImm);
9039 }
9040 }
9041
9042 static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
9043 const AArch64InstrInfo *TII,
9044 bool ShouldSignReturnAddr) {
9045 if (!ShouldSignReturnAddr)
9046 return;
9047
9048 BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE))
9049 .setMIFlag(MachineInstr::FrameSetup);
9050 BuildMI(MBB, MBB.getFirstInstrTerminator(), DebugLoc(),
9051 TII->get(AArch64::PAUTH_EPILOGUE))
9052 .setMIFlag(MachineInstr::FrameDestroy);
9053 }
9054
9055 void AArch64InstrInfo::buildOutlinedFrame(
9056 MachineBasicBlock &MBB, MachineFunction &MF,
9057 const outliner::OutlinedFunction &OF) const {
9058
9059 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
9060
9061 if (OF.FrameConstructionID == MachineOutlinerTailCall)
9062 FI->setOutliningStyle("Tail Call");
9063 else if (OF.FrameConstructionID == MachineOutlinerThunk) {
9064 // For thunk outlining, rewrite the last instruction from a call to a
9065 // tail-call.
9066 MachineInstr *Call = &*--MBB.instr_end();
9067 unsigned TailOpcode;
9068 if (Call->getOpcode() == AArch64::BL) {
9069 TailOpcode = AArch64::TCRETURNdi;
9070 } else {
9071 assert(Call->getOpcode() == AArch64::BLR ||
9072 Call->getOpcode() == AArch64::BLRNoIP);
9073 TailOpcode = AArch64::TCRETURNriALL;
9074 }
9075 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
9076 .add(Call->getOperand(0))
9077 .addImm(0);
9078 MBB.insert(MBB.end(), TC);
9079 Call->eraseFromParent();
9080
9081 FI->setOutliningStyle("Thunk");
9082 }
9083
9084 bool IsLeafFunction = true;
9085
9086 // Is there a call in the outlined range?
9087 auto IsNonTailCall = [](const MachineInstr &MI) {
9088 return MI.isCall() && !MI.isReturn();
9089 };
9090
9091 if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
9092 // Fix up the instructions in the range, since we're going to modify the
9093 // stack.
9094
9095 // Bugzilla ID: 46767
9096 // TODO: Check if fixing up twice is safe so we can outline these.
9097 assert(OF.FrameConstructionID != MachineOutlinerDefault &&
9098 "Can only fix up stack references once");
9099 fixupPostOutline(MBB);
9100
9101 IsLeafFunction = false;
9102
9103 // LR has to be a live in so that we can save it.
9104 if (!MBB.isLiveIn(AArch64::LR))
9105 MBB.addLiveIn(AArch64::LR);
9106
9107 MachineBasicBlock::iterator It = MBB.begin();
9108 MachineBasicBlock::iterator Et = MBB.end();
9109
9110 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9111 OF.FrameConstructionID == MachineOutlinerThunk)
9112 Et = std::prev(MBB.end());
9113
9114 // Insert a save before the outlined region
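// Roughly, this is the pre-indexed store "str x30, [sp, #-16]!"; the matching
// post-indexed "ldr x30, [sp], #16" is inserted further down.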
9115 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
9116 .addReg(AArch64::SP, RegState::Define)
9117 .addReg(AArch64::LR)
9118 .addReg(AArch64::SP)
9119 .addImm(-16);
9120 It = MBB.insert(It, STRXpre);
9121
9122 if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
9123 const TargetSubtargetInfo &STI = MF.getSubtarget();
9124 const MCRegisterInfo *MRI = STI.getRegisterInfo();
9125 unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
9126
9127 // Add a CFI saying the stack was moved 16 B down.
9128 int64_t StackPosEntry =
9129 MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));
9130 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
9131 .addCFIIndex(StackPosEntry)
9132 .setMIFlags(MachineInstr::FrameSetup);
9133
9134 // Add a CFI saying that the LR that we want to find is now 16 B higher
9135 // than before.
9136 int64_t LRPosEntry = MF.addFrameInst(
9137 MCCFIInstruction::createOffset(nullptr, DwarfReg, -16));
9138 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
9139 .addCFIIndex(LRPosEntry)
9140 .setMIFlags(MachineInstr::FrameSetup);
9141 }
9142
9143 // Insert a restore before the terminator for the function.
9144 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
9145 .addReg(AArch64::SP, RegState::Define)
9146 .addReg(AArch64::LR, RegState::Define)
9147 .addReg(AArch64::SP)
9148 .addImm(16);
9149 Et = MBB.insert(Et, LDRXpost);
9150 }
9151
9152 bool ShouldSignReturnAddr = FI->shouldSignReturnAddress(!IsLeafFunction);
9153
9154 // If this is a tail call outlined function, then there's already a return.
9155 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9156 OF.FrameConstructionID == MachineOutlinerThunk) {
9157 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
9158 return;
9159 }
9160
9161 // It's not a tail call, so we have to insert the return ourselves.
9162
9163 // LR has to be a live in so that we can return to it.
9164 if (!MBB.isLiveIn(AArch64::LR))
9165 MBB.addLiveIn(AArch64::LR);
9166
9167 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
9168 .addReg(AArch64::LR);
9169 MBB.insert(MBB.end(), ret);
9170
9171 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr);
9172
9173 FI->setOutliningStyle("Function");
9174
9175 // Did we have to modify the stack by saving the link register?
9176 if (OF.FrameConstructionID != MachineOutlinerDefault)
9177 return;
9178
9179 // We modified the stack.
9180 // Walk over the basic block and fix up all the stack accesses.
9181 fixupPostOutline(MBB);
9182 }
9183
9184 MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
9185 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
9186 MachineFunction &MF, outliner::Candidate &C) const {
9187
9188 // Are we tail calling?
9189 if (C.CallConstructionID == MachineOutlinerTailCall) {
9190 // If yes, then we can just branch to the label.
9191 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
9192 .addGlobalAddress(M.getNamedValue(MF.getName()))
9193 .addImm(0));
9194 return It;
9195 }
9196
9197 // Are we saving the link register?
9198 if (C.CallConstructionID == MachineOutlinerNoLRSave ||
9199 C.CallConstructionID == MachineOutlinerThunk) {
9200 // No, so just insert the call.
9201 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
9202 .addGlobalAddress(M.getNamedValue(MF.getName())));
9203 return It;
9204 }
9205
9206 // We want to return the spot where we inserted the call.
9207 MachineBasicBlock::iterator CallPt;
9208
9209 // Instructions for saving and restoring LR around the call instruction we're
9210 // going to insert.
9211 MachineInstr *Save;
9212 MachineInstr *Restore;
9213 // Can we save to a register?
9214 if (C.CallConstructionID == MachineOutlinerRegSave) {
9215 // FIXME: This logic should be sunk into a target-specific interface so that
9216 // we don't have to recompute the register.
9217 Register Reg = findRegisterToSaveLRTo(C);
9218 assert(Reg && "No callee-saved register available?");
9219
9220 // LR has to be a live in so that we can save it.
9221 if (!MBB.isLiveIn(AArch64::LR))
9222 MBB.addLiveIn(AArch64::LR);
9223
9224 // Save and restore LR from Reg.
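// ORRXrs with XZR as the first source is the canonical encoding of a plain
// register move, i.e. roughly "mov Reg, lr" here and "mov lr, Reg" below.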
9225 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
9226 .addReg(AArch64::XZR)
9227 .addReg(AArch64::LR)
9228 .addImm(0);
9229 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
9230 .addReg(AArch64::XZR)
9231 .addReg(Reg)
9232 .addImm(0);
9233 } else {
9234 // We have the default case. Save and restore from SP.
9235 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
9236 .addReg(AArch64::SP, RegState::Define)
9237 .addReg(AArch64::LR)
9238 .addReg(AArch64::SP)
9239 .addImm(-16);
9240 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
9241 .addReg(AArch64::SP, RegState::Define)
9242 .addReg(AArch64::LR, RegState::Define)
9243 .addReg(AArch64::SP)
9244 .addImm(16);
9245 }
9246
9247 It = MBB.insert(It, Save);
9248 It++;
9249
9250 // Insert the call.
9251 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
9252 .addGlobalAddress(M.getNamedValue(MF.getName())));
9253 CallPt = It;
9254 It++;
9255
9256 It = MBB.insert(It, Restore);
9257 return CallPt;
9258 }
9259
9260 bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
9261 MachineFunction &MF) const {
9262 return MF.getFunction().hasMinSize();
9263 }
9264
9265 void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
9266 MachineBasicBlock::iterator Iter,
9267 DebugLoc &DL,
9268 bool AllowSideEffects) const {
9269 const MachineFunction &MF = *MBB.getParent();
9270 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
9271 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
9272
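// The zeroing forms emitted below are, roughly:
//   GPR:                  movz Reg, #0
//   SVE vector register:  dup  Reg.d, #0
//   NEON vector register: movi Reg.2d, #0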
9273 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
9274 BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0);
9275 } else if (STI.hasSVE()) {
9276 BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg)
9277 .addImm(0)
9278 .addImm(0);
9279 } else {
9280 BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg)
9281 .addImm(0);
9282 }
9283 }
9284
9285 std::optional<DestSourcePair>
9286 AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
9287
9288 // AArch64::ORRWrs and AArch64::ORRXrs with a WZR/XZR source register and a
9289 // zero shift immediate are used as aliases for the mov instruction.
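// For example, "ORRWrs w0, wzr, w1, 0" is the canonical form of "mov w0, w1".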
9290 if (MI.getOpcode() == AArch64::ORRWrs &&
9291 MI.getOperand(1).getReg() == AArch64::WZR &&
9292 MI.getOperand(3).getImm() == 0x0 &&
9293 // Check that the w->w move is not a zero-extending w->x mov.
9294 (!MI.getOperand(0).getReg().isVirtual() ||
9295 MI.getOperand(0).getSubReg() == 0) &&
9296 (!MI.getOperand(0).getReg().isPhysical() ||
9297 MI.findRegisterDefOperandIdx(MI.getOperand(0).getReg() - AArch64::W0 +
9298 AArch64::X0,
9299 /*TRI=*/nullptr) == -1))
9300 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
9301
9302 if (MI.getOpcode() == AArch64::ORRXrs &&
9303 MI.getOperand(1).getReg() == AArch64::XZR &&
9304 MI.getOperand(3).getImm() == 0x0)
9305 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
9306
9307 return std::nullopt;
9308 }
9309
9310 std::optional<DestSourcePair>
9311 AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const {
9312 if (MI.getOpcode() == AArch64::ORRWrs &&
9313 MI.getOperand(1).getReg() == AArch64::WZR &&
9314 MI.getOperand(3).getImm() == 0x0)
9315 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
9316 return std::nullopt;
9317 }
9318
9319 std::optional<RegImmPair>
9320 AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
9321 int Sign = 1;
9322 int64_t Offset = 0;
9323
9324 // TODO: Handle cases where Reg is a super- or sub-register of the
9325 // destination register.
9326 const MachineOperand &Op0 = MI.getOperand(0);
9327 if (!Op0.isReg() || Reg != Op0.getReg())
9328 return std::nullopt;
9329
9330 switch (MI.getOpcode()) {
9331 default:
9332 return std::nullopt;
9333 case AArch64::SUBWri:
9334 case AArch64::SUBXri:
9335 case AArch64::SUBSWri:
9336 case AArch64::SUBSXri:
9337 Sign *= -1;
9338 [[fallthrough]];
9339 case AArch64::ADDSWri:
9340 case AArch64::ADDSXri:
9341 case AArch64::ADDWri:
9342 case AArch64::ADDXri: {
9343 // TODO: Third operand can be global address (usually some string).
9344 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
9345 !MI.getOperand(2).isImm())
9346 return std::nullopt;
9347 int Shift = MI.getOperand(3).getImm();
9348 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
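// For example, "ADDXri x0, x1, 5, 12" describes x0 = x1 + (5 << 12), so the
// returned offset would be 20480.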
9349 Offset = Sign * (MI.getOperand(2).getImm() << Shift);
9350 }
9351 }
9352 return RegImmPair{MI.getOperand(1).getReg(), Offset};
9353 }
9354
9355 /// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
9356 /// the destination register then, if possible, describe the value in terms of
9357 /// the source register.
9358 static std::optional<ParamLoadedValue>
9359 describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
9360 const TargetInstrInfo *TII,
9361 const TargetRegisterInfo *TRI) {
9362 auto DestSrc = TII->isCopyLikeInstr(MI);
9363 if (!DestSrc)
9364 return std::nullopt;
9365
9366 Register DestReg = DestSrc->Destination->getReg();
9367 Register SrcReg = DestSrc->Source->getReg();
9368
9369 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
9370
9371 // If the described register is the destination, just return the source.
9372 if (DestReg == DescribedReg)
9373 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
9374
9375 // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
9376 if (MI.getOpcode() == AArch64::ORRWrs &&
9377 TRI->isSuperRegister(DestReg, DescribedReg))
9378 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
9379
9380 // We may need to describe the lower part of a ORRXrs move.
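// For example, if the copy is "mov x1, x0" (ORRXrs x1, xzr, x0) and W1 is the
// described register, answer with W0, the sub_32 part of the source.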
9381 if (MI.getOpcode() == AArch64::ORRXrs &&
9382 TRI->isSubRegister(DestReg, DescribedReg)) {
9383 Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
9384 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
9385 }
9386
9387 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
9388 "Unhandled ORR[XW]rs copy case");
9389
9390 return std::nullopt;
9391 }
9392
9393 bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
9394 // Functions cannot be split to different sections on AArch64 if they have
9395 // a red zone. This is because relaxing a cross-section branch may require
9396 // incrementing the stack pointer to spill a register, which would overwrite
9397 // the red zone.
9398 if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true))
9399 return false;
9400
9401 return TargetInstrInfo::isFunctionSafeToSplit(MF);
9402 }
9403
9404 bool AArch64InstrInfo::isMBBSafeToSplitToCold(
9405 const MachineBasicBlock &MBB) const {
9406 // Asm Goto blocks can contain conditional branches to goto labels, which can
9407 // get moved out of range of the branch instruction.
9408 auto isAsmGoto = [](const MachineInstr &MI) {
9409 return MI.getOpcode() == AArch64::INLINEASM_BR;
9410 };
9411 if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
9412 return false;
9413
9414 // Because jump tables are label-relative instead of table-relative, they all
9415 // must be in the same section or relocation fixup handling will fail.
9416
9417 // Check if MBB is a jump table target
9418 const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
9419 auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
9420 return llvm::is_contained(JTE.MBBs, &MBB);
9421 };
9422 if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB))
9423 return false;
9424
9425 // Check if MBB contains a jump table lookup
9426 for (const MachineInstr &MI : MBB) {
9427 switch (MI.getOpcode()) {
9428 case TargetOpcode::G_BRJT:
9429 case AArch64::JumpTableDest32:
9430 case AArch64::JumpTableDest16:
9431 case AArch64::JumpTableDest8:
9432 return false;
9433 default:
9434 continue;
9435 }
9436 }
9437
9438 // MBB isn't a special case, so it's safe to be split to the cold section.
9439 return true;
9440 }
9441
9442 std::optional<ParamLoadedValue>
9443 AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
9444 Register Reg) const {
9445 const MachineFunction *MF = MI.getMF();
9446 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
9447 switch (MI.getOpcode()) {
9448 case AArch64::MOVZWi:
9449 case AArch64::MOVZXi: {
9450 // MOVZWi may be used for producing zero-extended 32-bit immediates in
9451 // 64-bit parameters, so we need to consider super-registers.
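// (A 32-bit register write zeroes the upper 32 bits, so e.g. "movz w0, #1"
// also determines the value of x0.)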
9452 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
9453 return std::nullopt;
9454
9455 if (!MI.getOperand(1).isImm())
9456 return std::nullopt;
9457 int64_t Immediate = MI.getOperand(1).getImm();
9458 int Shift = MI.getOperand(2).getImm();
9459 return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
9460 nullptr);
9461 }
9462 case AArch64::ORRWrs:
9463 case AArch64::ORRXrs:
9464 return describeORRLoadedValue(MI, Reg, this, TRI);
9465 }
9466
9467 return TargetInstrInfo::describeLoadedValue(MI, Reg);
9468 }
9469
9470 bool AArch64InstrInfo::isExtendLikelyToBeFolded(
9471 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
9472 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
9473 ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
9474 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);
9475
9476 // Anyexts are nops.
9477 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
9478 return true;
9479
9480 Register DefReg = ExtMI.getOperand(0).getReg();
9481 if (!MRI.hasOneNonDBGUse(DefReg))
9482 return false;
9483
9484 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
9485 // addressing mode.
9486 auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
9487 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
9488 }
9489
9490 uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
9491 return get(Opc).TSFlags & AArch64::ElementSizeMask;
9492 }
9493
9494 bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
9495 return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
9496 }
9497
9498 bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
9499 return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
9500 }
9501
9502 unsigned int
9503 AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
9504 return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
9505 }
9506
9507 bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
9508 unsigned Scale) const {
9509 if (Offset && Scale)
9510 return false;
9511
9512 // Check Reg + Imm
9513 if (!Scale) {
9514 // 9-bit signed offset
9515 if (isInt<9>(Offset))
9516 return true;
9517
9518 // 12-bit unsigned offset
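// (scaled by the access size; e.g. for an 8-byte access this covers offsets
// 0 .. 8 * 4095 = 32760 in multiples of 8)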
9519 unsigned Shift = Log2_64(NumBytes);
9520 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
9521 // Must be a multiple of NumBytes (NumBytes is a power of 2)
9522 (Offset >> Shift) << Shift == Offset)
9523 return true;
9524 return false;
9525 }
9526
9527 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
9528 return Scale == 1 || (Scale > 0 && Scale == NumBytes);
9529 }
9530
9531 unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
9532 if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
9533 return AArch64::BLRNoIP;
9534 else
9535 return AArch64::BLR;
9536 }
9537
9538 MachineBasicBlock::iterator
9539 AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
9540 Register TargetReg, bool FrameSetup) const {
9541 assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
9542
9543 MachineBasicBlock &MBB = *MBBI->getParent();
9544 MachineFunction &MF = *MBB.getParent();
9545 const AArch64InstrInfo *TII =
9546 MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
9547 int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
9548 DebugLoc DL = MBB.findDebugLoc(MBBI);
9549
9550 MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
9551 MachineBasicBlock *LoopTestMBB =
9552 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
9553 MF.insert(MBBInsertPoint, LoopTestMBB);
9554 MachineBasicBlock *LoopBodyMBB =
9555 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
9556 MF.insert(MBBInsertPoint, LoopBodyMBB);
9557 MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
9558 MF.insert(MBBInsertPoint, ExitMBB);
9559 MachineInstr::MIFlag Flags =
9560 FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;
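// The emitted probing sequence is roughly:
//   LoopTest: sub  sp, sp, #ProbeSize
//             cmp  sp, TargetReg
//             b.le Exit
//   LoopBody: str  xzr, [sp]
//             b    LoopTest
//   Exit:     mov  sp, TargetReg
//             ldr  xzr, [sp]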
9561
9562 // LoopTest:
9563 // SUB SP, SP, #ProbeSize
9564 emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
9565 AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);
9566
9567 // CMP SP, TargetReg
9568 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
9569 AArch64::XZR)
9570 .addReg(AArch64::SP)
9571 .addReg(TargetReg)
9572 .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
9573 .setMIFlags(Flags);
9574
9575 // B.<Cond> LoopExit
9576 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
9577 .addImm(AArch64CC::LE)
9578 .addMBB(ExitMBB)
9579 .setMIFlags(Flags);
9580
9581 // STR XZR, [SP]
9582 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
9583 .addReg(AArch64::XZR)
9584 .addReg(AArch64::SP)
9585 .addImm(0)
9586 .setMIFlags(Flags);
9587
9588 // B loop
9589 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
9590 .addMBB(LoopTestMBB)
9591 .setMIFlags(Flags);
9592
9593 // LoopExit:
9594 // MOV SP, TargetReg
9595 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
9596 .addReg(TargetReg)
9597 .addImm(0)
9598 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
9599 .setMIFlags(Flags);
9600
9601 // LDR XZR, [SP]
9602 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))
9603 .addReg(AArch64::XZR, RegState::Define)
9604 .addReg(AArch64::SP)
9605 .addImm(0)
9606 .setMIFlags(Flags);
9607
9608 ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
9609 ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
9610
9611 LoopTestMBB->addSuccessor(ExitMBB);
9612 LoopTestMBB->addSuccessor(LoopBodyMBB);
9613 LoopBodyMBB->addSuccessor(LoopTestMBB);
9614 MBB.addSuccessor(LoopTestMBB);
9615
9616 // Update liveins.
9617 if (MF.getRegInfo().reservedRegsFrozen())
9618 fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB});
9619
9620 return ExitMBB->begin();
9621 }
9622
9623 namespace {
9624 class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
9625 MachineFunction *MF;
9626 const TargetInstrInfo *TII;
9627 const TargetRegisterInfo *TRI;
9628 MachineRegisterInfo &MRI;
9629
9630 /// The block of the loop
9631 MachineBasicBlock *LoopBB;
9632 /// The conditional branch of the loop
9633 MachineInstr *CondBranch;
9634 /// The compare instruction for loop control
9635 MachineInstr *Comp;
9636 /// The number of the operand of the loop counter value in Comp
9637 unsigned CompCounterOprNum;
9638 /// The instruction that updates the loop counter value
9639 MachineInstr *Update;
9640 /// The number of the operand of the loop counter value in Update
9641 unsigned UpdateCounterOprNum;
9642 /// The initial value of the loop counter
9643 Register Init;
9644 /// True iff Update is a predecessor of Comp
9645 bool IsUpdatePriorComp;
9646
9647 /// The normalized condition used by createTripCountGreaterCondition()
9648 SmallVector<MachineOperand, 4> Cond;
9649
9650 public:
9651 AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
9652 MachineInstr *Comp, unsigned CompCounterOprNum,
9653 MachineInstr *Update, unsigned UpdateCounterOprNum,
9654 Register Init, bool IsUpdatePriorComp,
9655 const SmallVectorImpl<MachineOperand> &Cond)
9656 : MF(Comp->getParent()->getParent()),
9657 TII(MF->getSubtarget().getInstrInfo()),
9658 TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
9659 LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
9660 CompCounterOprNum(CompCounterOprNum), Update(Update),
9661 UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
9662 IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}
9663
9664 bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
9665 // Make the instructions for loop control be placed in stage 0.
9666 // The predecessors of Comp are considered by the caller.
9667 return MI == Comp;
9668 }
9669
9670 std::optional<bool> createTripCountGreaterCondition(
9671 int TC, MachineBasicBlock &MBB,
9672 SmallVectorImpl<MachineOperand> &CondParam) override {
9673 // A branch instruction will be inserted as "if (Cond) goto epilogue".
9674 // Cond is normalized for such use.
9675 // The predecessors of the branch are assumed to have already been inserted.
9676 CondParam = Cond;
9677 return {};
9678 }
9679
9680 void createRemainingIterationsGreaterCondition(
9681 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
9682 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;
9683
9684 void setPreheader(MachineBasicBlock *NewPreheader) override {}
9685
9686 void adjustTripCount(int TripCountAdjust) override {}
9687
9688 void disposed() override {}
9689 bool isMVEExpanderSupported() override { return true; }
9690 };
9691 } // namespace
9692
9693 /// Clone an instruction from MI. The register of the ReplaceOprNum-th operand
9694 /// is replaced by ReplaceReg. The output register is newly created.
9695 /// The other operands are unchanged from MI.
9696 static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
9697 Register ReplaceReg, MachineBasicBlock &MBB,
9698 MachineBasicBlock::iterator InsertTo) {
9699 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
9700 const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
9701 const TargetRegisterInfo *TRI =
9702 MBB.getParent()->getSubtarget().getRegisterInfo();
9703 MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);
9704 Register Result = 0;
9705 for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
9706 if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) {
9707 Result = MRI.createVirtualRegister(
9708 MRI.getRegClass(NewMI->getOperand(0).getReg()));
9709 NewMI->getOperand(I).setReg(Result);
9710 } else if (I == ReplaceOprNum) {
9711 MRI.constrainRegClass(
9712 ReplaceReg,
9713 TII->getRegClass(NewMI->getDesc(), I, TRI, *MBB.getParent()));
9714 NewMI->getOperand(I).setReg(ReplaceReg);
9715 }
9716 }
9717 MBB.insert(InsertTo, NewMI);
9718 return Result;
9719 }
9720
9721 void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
9722 int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
9723 DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) {
9724 // Create and accumulate the conditions for the next TC iterations.
9725 // Example:
9726 // SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the last
9727 // # iteration of the kernel
9728 //
9729 // # insert the following instructions
9730 // cond = CSINCXr 0, 0, C, implicit $nzcv
9731 // counter = ADDXri counter, 1 # clone from this->Update
9732 // SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
9733 // cond = CSINCXr cond, cond, C, implicit $nzcv
9734 // ... (repeat TC times)
9735 // SUBSXri cond, 0, implicit-def $nzcv
9736
9737 assert(CondBranch->getOpcode() == AArch64::Bcc);
9738 // CondCode to exit the loop
9739 AArch64CC::CondCode CC =
9740 (AArch64CC::CondCode)CondBranch->getOperand(0).getImm();
9741 if (CondBranch->getOperand(1).getMBB() == LoopBB)
9742 CC = AArch64CC::getInvertedCondCode(CC);
9743
9744 // Accumulate conditions to exit the loop
9745 Register AccCond = AArch64::XZR;
9746
9747 // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
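// (CSINC Xd, Xn, Xm, cond computes "cond ? Xn : Xm + 1", so passing the
// inverted condition yields CurCond + 1 exactly when CC holds.)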
9748 auto AccumulateCond = [&](Register CurCond,
9749 AArch64CC::CondCode CC) -> Register {
9750 Register NewCond = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
9751 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr))
9752 .addReg(NewCond, RegState::Define)
9753 .addReg(CurCond)
9754 .addReg(CurCond)
9755 .addImm(AArch64CC::getInvertedCondCode(CC));
9756 return NewCond;
9757 };
9758
9759 if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
9760 // Update and Comp for I == 0 already exist in MBB
9761 // (MBB is an unrolled kernel)
9762 Register Counter;
9763 for (int I = 0; I <= TC; ++I) {
9764 Register NextCounter;
9765 if (I != 0)
9766 NextCounter =
9767 cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
9768
9769 AccCond = AccumulateCond(AccCond, CC);
9770
9771 if (I != TC) {
9772 if (I == 0) {
9773 if (Update != Comp && IsUpdatePriorComp) {
9774 Counter =
9775 LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
9776 NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB,
9777 MBB.end());
9778 } else {
9779 // Reuse the value already computed in the kernel block.
9780 NextCounter = LastStage0Insts[Update]->getOperand(0).getReg();
9781 }
9782 } else if (Update != Comp) {
9783 NextCounter =
9784 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
9785 }
9786 }
9787 Counter = NextCounter;
9788 }
9789 } else {
9790 Register Counter;
9791 if (LastStage0Insts.empty()) {
9792 // Use the initial counter value (we are testing whether the trip count is
9793 // large enough for the pipelined code to be executed).
9794 Counter = Init;
9795 if (IsUpdatePriorComp)
9796 Counter =
9797 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
9798 } else {
9799 // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel block.
9800 Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
9801 }
9802
9803 for (int I = 0; I <= TC; ++I) {
9804 Register NextCounter;
9805 NextCounter =
9806 cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
9807 AccCond = AccumulateCond(AccCond, CC);
9808 if (I != TC && Update != Comp)
9809 NextCounter =
9810 cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
9811 Counter = NextCounter;
9812 }
9813 }
9814
9815 // If AccCond == 0, the remainder is greater than TC.
9816 BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri))
9817 .addReg(AArch64::XZR, RegState::Define | RegState::Dead)
9818 .addReg(AccCond)
9819 .addImm(0)
9820 .addImm(0);
9821 Cond.clear();
9822 Cond.push_back(MachineOperand::CreateImm(AArch64CC::EQ));
9823 }
9824
9825 static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
9826 Register &RegMBB, Register &RegOther) {
9827 assert(Phi.getNumOperands() == 5);
9828 if (Phi.getOperand(2).getMBB() == MBB) {
9829 RegMBB = Phi.getOperand(1).getReg();
9830 RegOther = Phi.getOperand(3).getReg();
9831 } else {
9832 assert(Phi.getOperand(4).getMBB() == MBB);
9833 RegMBB = Phi.getOperand(3).getReg();
9834 RegOther = Phi.getOperand(1).getReg();
9835 }
9836 }
9837
9838 static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) {
9839 if (!Reg.isVirtual())
9840 return false;
9841 const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
9842 return MRI.getVRegDef(Reg)->getParent() != BB;
9843 }
9844
9845 /// If Reg is an induction variable, return true and describe how it is
/// updated: the update instruction, its counter operand, the initial register,
/// and whether the update precedes the compare.
9846 static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
9847 MachineInstr *&UpdateInst,
9848 unsigned &UpdateCounterOprNum, Register &InitReg,
9849 bool &IsUpdatePriorComp) {
9850 // Example:
9851 //
9852 // Preheader:
9853 // InitReg = ...
9854 // LoopBB:
9855 // Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
9856 // Reg = COPY Reg0 ; COPY is ignored.
9857 // Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
9858 // ; Reg is the value calculated in the previous
9859 // ; iteration, so IsUpdatePriorComp == false.
9860
9861 if (LoopBB->pred_size() != 2)
9862 return false;
9863 if (!Reg.isVirtual())
9864 return false;
9865 const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
9866 UpdateInst = nullptr;
9867 UpdateCounterOprNum = 0;
9868 InitReg = 0;
9869 IsUpdatePriorComp = true;
9870 Register CurReg = Reg;
9871 while (true) {
9872 MachineInstr *Def = MRI.getVRegDef(CurReg);
9873 if (Def->getParent() != LoopBB)
9874 return false;
9875 if (Def->isCopy()) {
9876 // Ignore copy instructions unless they contain subregisters
9877 if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
9878 return false;
9879 CurReg = Def->getOperand(1).getReg();
9880 } else if (Def->isPHI()) {
9881 if (InitReg != 0)
9882 return false;
9883 if (!UpdateInst)
9884 IsUpdatePriorComp = false;
9885 extractPhiReg(*Def, LoopBB, CurReg, InitReg);
9886 } else {
9887 if (UpdateInst)
9888 return false;
9889 switch (Def->getOpcode()) {
9890 case AArch64::ADDSXri:
9891 case AArch64::ADDSWri:
9892 case AArch64::SUBSXri:
9893 case AArch64::SUBSWri:
9894 case AArch64::ADDXri:
9895 case AArch64::ADDWri:
9896 case AArch64::SUBXri:
9897 case AArch64::SUBWri:
9898 UpdateInst = Def;
9899 UpdateCounterOprNum = 1;
9900 break;
9901 case AArch64::ADDSXrr:
9902 case AArch64::ADDSWrr:
9903 case AArch64::SUBSXrr:
9904 case AArch64::SUBSWrr:
9905 case AArch64::ADDXrr:
9906 case AArch64::ADDWrr:
9907 case AArch64::SUBXrr:
9908 case AArch64::SUBWrr:
9909 UpdateInst = Def;
9910 if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB))
9911 UpdateCounterOprNum = 1;
9912 else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB))
9913 UpdateCounterOprNum = 2;
9914 else
9915 return false;
9916 break;
9917 default:
9918 return false;
9919 }
9920 CurReg = Def->getOperand(UpdateCounterOprNum).getReg();
9921 }
9922
9923 if (!CurReg.isVirtual())
9924 return false;
9925 if (Reg == CurReg)
9926 break;
9927 }
9928
9929 if (!UpdateInst)
9930 return false;
9931
9932 return true;
9933 }
9934
9935 std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
9936 AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
9937 // Accept loops that meet the following conditions
9938 // * The conditional branch is BCC
9939 // * The compare instruction is ADDS/SUBS/WHILEXX
9940 // * One operand of the compare is an induction variable and the other is a
9941 // loop invariant value
9942 // * The induction variable is incremented/decremented by a single instruction
9943 // * Does not contain CALL or instructions which have unmodeled side effects
9944
9945 for (MachineInstr &MI : *LoopBB)
9946 if (MI.isCall() || MI.hasUnmodeledSideEffects())
9947 // This instruction may use NZCV, which interferes with the instruction to
9948 // be inserted for loop control.
9949 return nullptr;
9950
9951 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
9952 SmallVector<MachineOperand, 4> Cond;
9953 if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
9954 return nullptr;
9955
9956 // Infinite loops are not supported
9957 if (TBB == LoopBB && FBB == LoopBB)
9958 return nullptr;
9959
9960 // Must be a conditional branch
9961 if (TBB != LoopBB && FBB == nullptr)
9962 return nullptr;
9963
9964 assert((TBB == LoopBB || FBB == LoopBB) &&
9965 "The Loop must be a single-basic-block loop");
9966
9967 MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
9968 const TargetRegisterInfo &TRI = getRegisterInfo();
9969
9970 if (CondBranch->getOpcode() != AArch64::Bcc)
9971 return nullptr;
9972
9973 // Normalization for createTripCountGreaterCondition()
9974 if (TBB == LoopBB)
9975 reverseBranchCondition(Cond);
9976
9977 MachineInstr *Comp = nullptr;
9978 unsigned CompCounterOprNum = 0;
9979 for (MachineInstr &MI : reverse(*LoopBB)) {
9980 if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
9981 // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
9982 // operands is a loop invariant value
9983
9984 switch (MI.getOpcode()) {
9985 case AArch64::SUBSXri:
9986 case AArch64::SUBSWri:
9987 case AArch64::ADDSXri:
9988 case AArch64::ADDSWri:
9989 Comp = &MI;
9990 CompCounterOprNum = 1;
9991 break;
9992 case AArch64::ADDSWrr:
9993 case AArch64::ADDSXrr:
9994 case AArch64::SUBSWrr:
9995 case AArch64::SUBSXrr:
9996 Comp = &MI;
9997 break;
9998 default:
9999 if (isWhileOpcode(MI.getOpcode())) {
10000 Comp = &MI;
10001 break;
10002 }
10003 return nullptr;
10004 }
10005
10006 if (CompCounterOprNum == 0) {
10007 if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB))
10008 CompCounterOprNum = 2;
10009 else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB))
10010 CompCounterOprNum = 1;
10011 else
10012 return nullptr;
10013 }
10014 break;
10015 }
10016 }
10017 if (!Comp)
10018 return nullptr;
10019
10020 MachineInstr *Update = nullptr;
10021 Register Init;
10022 bool IsUpdatePriorComp;
10023 unsigned UpdateCounterOprNum;
10024 if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB,
10025 Update, UpdateCounterOprNum, Init, IsUpdatePriorComp))
10026 return nullptr;
10027
10028 return std::make_unique<AArch64PipelinerLoopInfo>(
10029 LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum,
10030 Init, IsUpdatePriorComp, Cond);
10031 }
10032
10033 #define GET_INSTRINFO_HELPERS
10034 #define GET_INSTRMAP_INFO
10035 #include "AArch64GenInstrInfo.inc"
10036