1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains the AArch64 implementation of the TargetInstrInfo class. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "AArch64InstrInfo.h" 14 #include "AArch64ExpandImm.h" 15 #include "AArch64FrameLowering.h" 16 #include "AArch64MachineFunctionInfo.h" 17 #include "AArch64PointerAuth.h" 18 #include "AArch64Subtarget.h" 19 #include "MCTargetDesc/AArch64AddressingModes.h" 20 #include "MCTargetDesc/AArch64MCTargetDesc.h" 21 #include "Utils/AArch64BaseInfo.h" 22 #include "llvm/ADT/ArrayRef.h" 23 #include "llvm/ADT/STLExtras.h" 24 #include "llvm/ADT/SmallVector.h" 25 #include "llvm/CodeGen/LivePhysRegs.h" 26 #include "llvm/CodeGen/MachineBasicBlock.h" 27 #include "llvm/CodeGen/MachineCombinerPattern.h" 28 #include "llvm/CodeGen/MachineFrameInfo.h" 29 #include "llvm/CodeGen/MachineFunction.h" 30 #include "llvm/CodeGen/MachineInstr.h" 31 #include "llvm/CodeGen/MachineInstrBuilder.h" 32 #include "llvm/CodeGen/MachineMemOperand.h" 33 #include "llvm/CodeGen/MachineModuleInfo.h" 34 #include "llvm/CodeGen/MachineOperand.h" 35 #include "llvm/CodeGen/MachineRegisterInfo.h" 36 #include "llvm/CodeGen/RegisterScavenging.h" 37 #include "llvm/CodeGen/StackMaps.h" 38 #include "llvm/CodeGen/TargetRegisterInfo.h" 39 #include "llvm/CodeGen/TargetSubtargetInfo.h" 40 #include "llvm/IR/DebugInfoMetadata.h" 41 #include "llvm/IR/DebugLoc.h" 42 #include "llvm/IR/GlobalValue.h" 43 #include "llvm/IR/Module.h" 44 #include "llvm/MC/MCAsmInfo.h" 45 #include "llvm/MC/MCInst.h" 46 #include "llvm/MC/MCInstBuilder.h" 47 #include "llvm/MC/MCInstrDesc.h" 48 #include "llvm/Support/Casting.h" 49 #include "llvm/Support/CodeGen.h" 50 #include "llvm/Support/CommandLine.h" 51 #include "llvm/Support/ErrorHandling.h" 52 #include "llvm/Support/LEB128.h" 53 #include "llvm/Support/MathExtras.h" 54 #include "llvm/Target/TargetMachine.h" 55 #include "llvm/Target/TargetOptions.h" 56 #include <cassert> 57 #include <cstdint> 58 #include <iterator> 59 #include <utility> 60 61 using namespace llvm; 62 63 #define GET_INSTRINFO_CTOR_DTOR 64 #include "AArch64GenInstrInfo.inc" 65 66 static cl::opt<unsigned> TBZDisplacementBits( 67 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), 68 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); 69 70 static cl::opt<unsigned> CBZDisplacementBits( 71 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), 72 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)")); 73 74 static cl::opt<unsigned> 75 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), 76 cl::desc("Restrict range of Bcc instructions (DEBUG)")); 77 78 static cl::opt<unsigned> 79 BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26), 80 cl::desc("Restrict range of B instructions (DEBUG)")); 81 82 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) 83 : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP, 84 AArch64::CATCHRET), 85 RI(STI.getTargetTriple()), Subtarget(STI) {} 86 87 /// GetInstSize - Return the number of bytes of code the specified 88 /// instruction may be. This returns the maximum number of bytes. 
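/// Note: callers such as the generic branch relaxation pass use this as a
/// conservative upper bound when computing block offsets, so overestimating a
/// size is safe where underestimating one is not.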
89 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { 90 const MachineBasicBlock &MBB = *MI.getParent(); 91 const MachineFunction *MF = MBB.getParent(); 92 const Function &F = MF->getFunction(); 93 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); 94 95 { 96 auto Op = MI.getOpcode(); 97 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR) 98 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); 99 } 100 101 // Meta-instructions emit no code. 102 if (MI.isMetaInstruction()) 103 return 0; 104 105 // FIXME: We currently only handle pseudoinstructions that don't get expanded 106 // before the assembly printer. 107 unsigned NumBytes = 0; 108 const MCInstrDesc &Desc = MI.getDesc(); 109 110 // Size should be preferably set in 111 // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case). 112 // Specific cases handle instructions of variable sizes 113 switch (Desc.getOpcode()) { 114 default: 115 if (Desc.getSize()) 116 return Desc.getSize(); 117 118 // Anything not explicitly designated otherwise (i.e. pseudo-instructions 119 // with fixed constant size but not specified in .td file) is a normal 120 // 4-byte insn. 121 NumBytes = 4; 122 break; 123 case TargetOpcode::STACKMAP: 124 // The upper bound for a stackmap intrinsic is the full length of its shadow 125 NumBytes = StackMapOpers(&MI).getNumPatchBytes(); 126 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 127 break; 128 case TargetOpcode::PATCHPOINT: 129 // The size of the patchpoint intrinsic is the number of bytes requested 130 NumBytes = PatchPointOpers(&MI).getNumPatchBytes(); 131 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 132 break; 133 case TargetOpcode::STATEPOINT: 134 NumBytes = StatepointOpers(&MI).getNumPatchBytes(); 135 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 136 // No patch bytes means a normal call inst is emitted 137 if (NumBytes == 0) 138 NumBytes = 4; 139 break; 140 case TargetOpcode::PATCHABLE_FUNCTION_ENTER: 141 // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER 142 // instructions are expanded to the specified number of NOPs. Otherwise, 143 // they are expanded to 36-byte XRay sleds. 144 NumBytes = 145 F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4; 146 break; 147 case TargetOpcode::PATCHABLE_FUNCTION_EXIT: 148 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL: 149 // An XRay sled can be 4 bytes of alignment plus a 32-byte block. 150 NumBytes = 36; 151 break; 152 case TargetOpcode::PATCHABLE_EVENT_CALL: 153 // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment). 154 NumBytes = 24; 155 break; 156 157 case AArch64::SPACE: 158 NumBytes = MI.getOperand(1).getImm(); 159 break; 160 case TargetOpcode::BUNDLE: 161 NumBytes = getInstBundleLength(MI); 162 break; 163 } 164 165 return NumBytes; 166 } 167 168 unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const { 169 unsigned Size = 0; 170 MachineBasicBlock::const_instr_iterator I = MI.getIterator(); 171 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); 172 while (++I != E && I->isInsideBundle()) { 173 assert(!I->isBundle() && "No nested bundle!"); 174 Size += getInstSizeInBytes(*I); 175 } 176 return Size; 177 } 178 179 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, 180 SmallVectorImpl<MachineOperand> &Cond) { 181 // Block ends with fall-through condbranch. 
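  // The Cond vector encodes the branch so that it can later be reversed or
  // re-emitted (roughly):
  //   Bcc             -> { condition code }
  //   CB(N)Z W/X      -> { -1, opcode, register }
  //   TB(N)Z W/X      -> { -1, opcode, register, bit number }
  // instantiateCondBranch() and insertSelect() below consume this encoding.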
182 switch (LastInst->getOpcode()) { 183 default: 184 llvm_unreachable("Unknown branch instruction?"); 185 case AArch64::Bcc: 186 Target = LastInst->getOperand(1).getMBB(); 187 Cond.push_back(LastInst->getOperand(0)); 188 break; 189 case AArch64::CBZW: 190 case AArch64::CBZX: 191 case AArch64::CBNZW: 192 case AArch64::CBNZX: 193 Target = LastInst->getOperand(1).getMBB(); 194 Cond.push_back(MachineOperand::CreateImm(-1)); 195 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); 196 Cond.push_back(LastInst->getOperand(0)); 197 break; 198 case AArch64::TBZW: 199 case AArch64::TBZX: 200 case AArch64::TBNZW: 201 case AArch64::TBNZX: 202 Target = LastInst->getOperand(2).getMBB(); 203 Cond.push_back(MachineOperand::CreateImm(-1)); 204 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); 205 Cond.push_back(LastInst->getOperand(0)); 206 Cond.push_back(LastInst->getOperand(1)); 207 } 208 } 209 210 static unsigned getBranchDisplacementBits(unsigned Opc) { 211 switch (Opc) { 212 default: 213 llvm_unreachable("unexpected opcode!"); 214 case AArch64::B: 215 return BDisplacementBits; 216 case AArch64::TBNZW: 217 case AArch64::TBZW: 218 case AArch64::TBNZX: 219 case AArch64::TBZX: 220 return TBZDisplacementBits; 221 case AArch64::CBNZW: 222 case AArch64::CBZW: 223 case AArch64::CBNZX: 224 case AArch64::CBZX: 225 return CBZDisplacementBits; 226 case AArch64::Bcc: 227 return BCCDisplacementBits; 228 } 229 } 230 231 bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp, 232 int64_t BrOffset) const { 233 unsigned Bits = getBranchDisplacementBits(BranchOp); 234 assert(Bits >= 3 && "max branch displacement must be enough to jump" 235 "over conditional branch expansion"); 236 return isIntN(Bits, BrOffset / 4); 237 } 238 239 MachineBasicBlock * 240 AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const { 241 switch (MI.getOpcode()) { 242 default: 243 llvm_unreachable("unexpected opcode!"); 244 case AArch64::B: 245 return MI.getOperand(0).getMBB(); 246 case AArch64::TBZW: 247 case AArch64::TBNZW: 248 case AArch64::TBZX: 249 case AArch64::TBNZX: 250 return MI.getOperand(2).getMBB(); 251 case AArch64::CBZW: 252 case AArch64::CBNZW: 253 case AArch64::CBZX: 254 case AArch64::CBNZX: 255 case AArch64::Bcc: 256 return MI.getOperand(1).getMBB(); 257 } 258 } 259 260 void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, 261 MachineBasicBlock &NewDestBB, 262 MachineBasicBlock &RestoreBB, 263 const DebugLoc &DL, 264 int64_t BrOffset, 265 RegScavenger *RS) const { 266 assert(RS && "RegScavenger required for long branching"); 267 assert(MBB.empty() && 268 "new block should be inserted for expanding unconditional branch"); 269 assert(MBB.pred_size() == 1); 270 assert(RestoreBB.empty() && 271 "restore block should be inserted for restoring clobbered registers"); 272 273 auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) { 274 // Offsets outside of the signed 33-bit range are not supported for ADRP + 275 // ADD. 
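    // (ADRP produces a signed 21-bit page immediate that is shifted left by
    // 12 bits and the ADD restores the low 12 bits, so the reachable range is
    // roughly +/-4 GiB, i.e. a signed 33-bit byte offset.)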
276 if (!isInt<33>(BrOffset)) 277 report_fatal_error( 278 "Branch offsets outside of the signed 33-bit range not supported"); 279 280 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg) 281 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE); 282 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg) 283 .addReg(Reg) 284 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC) 285 .addImm(0); 286 BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg); 287 }; 288 289 RS->enterBasicBlockEnd(MBB); 290 // If X16 is unused, we can rely on the linker to insert a range extension 291 // thunk if NewDestBB is out of range of a single B instruction. 292 constexpr Register Reg = AArch64::X16; 293 if (!RS->isRegUsed(Reg)) { 294 insertUnconditionalBranch(MBB, &NewDestBB, DL); 295 RS->setRegUsed(Reg); 296 return; 297 } 298 299 // If there's a free register and it's worth inflating the code size, 300 // manually insert the indirect branch. 301 Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass); 302 if (Scavenged != AArch64::NoRegister && 303 MBB.getSectionID() == MBBSectionID::ColdSectionID) { 304 buildIndirectBranch(Scavenged, NewDestBB); 305 RS->setRegUsed(Scavenged); 306 return; 307 } 308 309 // Note: Spilling X16 briefly moves the stack pointer, making it incompatible 310 // with red zones. 311 AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>(); 312 if (!AFI || AFI->hasRedZone().value_or(true)) 313 report_fatal_error( 314 "Unable to insert indirect branch inside function that has red zone"); 315 316 // Otherwise, spill X16 and defer range extension to the linker. 317 BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre)) 318 .addReg(AArch64::SP, RegState::Define) 319 .addReg(Reg) 320 .addReg(AArch64::SP) 321 .addImm(-16); 322 323 BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB); 324 325 BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost)) 326 .addReg(AArch64::SP, RegState::Define) 327 .addReg(Reg, RegState::Define) 328 .addReg(AArch64::SP) 329 .addImm(16); 330 } 331 332 // Branch analysis. 333 bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB, 334 MachineBasicBlock *&TBB, 335 MachineBasicBlock *&FBB, 336 SmallVectorImpl<MachineOperand> &Cond, 337 bool AllowModify) const { 338 // If the block has no terminators, it just falls into the block after it. 339 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); 340 if (I == MBB.end()) 341 return false; 342 343 // Skip over SpeculationBarrierEndBB terminators 344 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB || 345 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) { 346 --I; 347 } 348 349 if (!isUnpredicatedTerminator(*I)) 350 return false; 351 352 // Get the last instruction in the block. 353 MachineInstr *LastInst = &*I; 354 355 // If there is only one terminator instruction, process it. 356 unsigned LastOpc = LastInst->getOpcode(); 357 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { 358 if (isUncondBranchOpcode(LastOpc)) { 359 TBB = LastInst->getOperand(0).getMBB(); 360 return false; 361 } 362 if (isCondBranchOpcode(LastOpc)) { 363 // Block ends with fall-through condbranch. 364 parseCondBranch(LastInst, TBB, Cond); 365 return false; 366 } 367 return true; // Can't handle indirect branch. 368 } 369 370 // Get the instruction before it if it is a terminator. 
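  // From here on at most two terminators are understood: an optional
  // conditional branch followed by an optional unconditional branch. Any
  // other combination is reported as unanalyzable by returning true.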
371 MachineInstr *SecondLastInst = &*I; 372 unsigned SecondLastOpc = SecondLastInst->getOpcode(); 373 374 // If AllowModify is true and the block ends with two or more unconditional 375 // branches, delete all but the first unconditional branch. 376 if (AllowModify && isUncondBranchOpcode(LastOpc)) { 377 while (isUncondBranchOpcode(SecondLastOpc)) { 378 LastInst->eraseFromParent(); 379 LastInst = SecondLastInst; 380 LastOpc = LastInst->getOpcode(); 381 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { 382 // Return now the only terminator is an unconditional branch. 383 TBB = LastInst->getOperand(0).getMBB(); 384 return false; 385 } 386 SecondLastInst = &*I; 387 SecondLastOpc = SecondLastInst->getOpcode(); 388 } 389 } 390 391 // If we're allowed to modify and the block ends in a unconditional branch 392 // which could simply fallthrough, remove the branch. (Note: This case only 393 // matters when we can't understand the whole sequence, otherwise it's also 394 // handled by BranchFolding.cpp.) 395 if (AllowModify && isUncondBranchOpcode(LastOpc) && 396 MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) { 397 LastInst->eraseFromParent(); 398 LastInst = SecondLastInst; 399 LastOpc = LastInst->getOpcode(); 400 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { 401 assert(!isUncondBranchOpcode(LastOpc) && 402 "unreachable unconditional branches removed above"); 403 404 if (isCondBranchOpcode(LastOpc)) { 405 // Block ends with fall-through condbranch. 406 parseCondBranch(LastInst, TBB, Cond); 407 return false; 408 } 409 return true; // Can't handle indirect branch. 410 } 411 SecondLastInst = &*I; 412 SecondLastOpc = SecondLastInst->getOpcode(); 413 } 414 415 // If there are three terminators, we don't know what sort of block this is. 416 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I)) 417 return true; 418 419 // If the block ends with a B and a Bcc, handle it. 420 if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { 421 parseCondBranch(SecondLastInst, TBB, Cond); 422 FBB = LastInst->getOperand(0).getMBB(); 423 return false; 424 } 425 426 // If the block ends with two unconditional branches, handle it. The second 427 // one is not executed, so remove it. 428 if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { 429 TBB = SecondLastInst->getOperand(0).getMBB(); 430 I = LastInst; 431 if (AllowModify) 432 I->eraseFromParent(); 433 return false; 434 } 435 436 // ...likewise if it ends with an indirect branch followed by an unconditional 437 // branch. 438 if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { 439 I = LastInst; 440 if (AllowModify) 441 I->eraseFromParent(); 442 return true; 443 } 444 445 // Otherwise, can't handle this. 446 return true; 447 } 448 449 bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB, 450 MachineBranchPredicate &MBP, 451 bool AllowModify) const { 452 // For the moment, handle only a block which ends with a cb(n)zx followed by 453 // a fallthrough. Why this? Because it is a common form. 454 // TODO: Should we handle b.cc? 455 456 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); 457 if (I == MBB.end()) 458 return true; 459 460 // Skip over SpeculationBarrierEndBB terminators 461 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB || 462 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) { 463 --I; 464 } 465 466 if (!isUnpredicatedTerminator(*I)) 467 return true; 468 469 // Get the last instruction in the block. 
470 MachineInstr *LastInst = &*I; 471 unsigned LastOpc = LastInst->getOpcode(); 472 if (!isCondBranchOpcode(LastOpc)) 473 return true; 474 475 switch (LastOpc) { 476 default: 477 return true; 478 case AArch64::CBZW: 479 case AArch64::CBZX: 480 case AArch64::CBNZW: 481 case AArch64::CBNZX: 482 break; 483 }; 484 485 MBP.TrueDest = LastInst->getOperand(1).getMBB(); 486 assert(MBP.TrueDest && "expected!"); 487 MBP.FalseDest = MBB.getNextNode(); 488 489 MBP.ConditionDef = nullptr; 490 MBP.SingleUseCondition = false; 491 492 MBP.LHS = LastInst->getOperand(0); 493 MBP.RHS = MachineOperand::CreateImm(0); 494 MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE 495 : MachineBranchPredicate::PRED_EQ; 496 return false; 497 } 498 499 bool AArch64InstrInfo::reverseBranchCondition( 500 SmallVectorImpl<MachineOperand> &Cond) const { 501 if (Cond[0].getImm() != -1) { 502 // Regular Bcc 503 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm(); 504 Cond[0].setImm(AArch64CC::getInvertedCondCode(CC)); 505 } else { 506 // Folded compare-and-branch 507 switch (Cond[1].getImm()) { 508 default: 509 llvm_unreachable("Unknown conditional branch!"); 510 case AArch64::CBZW: 511 Cond[1].setImm(AArch64::CBNZW); 512 break; 513 case AArch64::CBNZW: 514 Cond[1].setImm(AArch64::CBZW); 515 break; 516 case AArch64::CBZX: 517 Cond[1].setImm(AArch64::CBNZX); 518 break; 519 case AArch64::CBNZX: 520 Cond[1].setImm(AArch64::CBZX); 521 break; 522 case AArch64::TBZW: 523 Cond[1].setImm(AArch64::TBNZW); 524 break; 525 case AArch64::TBNZW: 526 Cond[1].setImm(AArch64::TBZW); 527 break; 528 case AArch64::TBZX: 529 Cond[1].setImm(AArch64::TBNZX); 530 break; 531 case AArch64::TBNZX: 532 Cond[1].setImm(AArch64::TBZX); 533 break; 534 } 535 } 536 537 return false; 538 } 539 540 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB, 541 int *BytesRemoved) const { 542 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); 543 if (I == MBB.end()) 544 return 0; 545 546 if (!isUncondBranchOpcode(I->getOpcode()) && 547 !isCondBranchOpcode(I->getOpcode())) 548 return 0; 549 550 // Remove the branch. 551 I->eraseFromParent(); 552 553 I = MBB.end(); 554 555 if (I == MBB.begin()) { 556 if (BytesRemoved) 557 *BytesRemoved = 4; 558 return 1; 559 } 560 --I; 561 if (!isCondBranchOpcode(I->getOpcode())) { 562 if (BytesRemoved) 563 *BytesRemoved = 4; 564 return 1; 565 } 566 567 // Remove the branch. 568 I->eraseFromParent(); 569 if (BytesRemoved) 570 *BytesRemoved = 8; 571 572 return 2; 573 } 574 575 void AArch64InstrInfo::instantiateCondBranch( 576 MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB, 577 ArrayRef<MachineOperand> Cond) const { 578 if (Cond[0].getImm() != -1) { 579 // Regular Bcc 580 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB); 581 } else { 582 // Folded compare-and-branch 583 // Note that we use addOperand instead of addReg to keep the flags. 584 const MachineInstrBuilder MIB = 585 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]); 586 if (Cond.size() > 3) 587 MIB.addImm(Cond[3].getImm()); 588 MIB.addMBB(TBB); 589 } 590 } 591 592 unsigned AArch64InstrInfo::insertBranch( 593 MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, 594 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const { 595 // Shouldn't be a fall through. 596 assert(TBB && "insertBranch must not be told to insert a fallthrough"); 597 598 if (!FBB) { 599 if (Cond.empty()) // Unconditional branch? 
600 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB); 601 else 602 instantiateCondBranch(MBB, DL, TBB, Cond); 603 604 if (BytesAdded) 605 *BytesAdded = 4; 606 607 return 1; 608 } 609 610 // Two-way conditional branch. 611 instantiateCondBranch(MBB, DL, TBB, Cond); 612 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB); 613 614 if (BytesAdded) 615 *BytesAdded = 8; 616 617 return 2; 618 } 619 620 // Find the original register that VReg is copied from. 621 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { 622 while (Register::isVirtualRegister(VReg)) { 623 const MachineInstr *DefMI = MRI.getVRegDef(VReg); 624 if (!DefMI->isFullCopy()) 625 return VReg; 626 VReg = DefMI->getOperand(1).getReg(); 627 } 628 return VReg; 629 } 630 631 // Determine if VReg is defined by an instruction that can be folded into a 632 // csel instruction. If so, return the folded opcode, and the replacement 633 // register. 634 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, 635 unsigned *NewVReg = nullptr) { 636 VReg = removeCopies(MRI, VReg); 637 if (!Register::isVirtualRegister(VReg)) 638 return 0; 639 640 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg)); 641 const MachineInstr *DefMI = MRI.getVRegDef(VReg); 642 unsigned Opc = 0; 643 unsigned SrcOpNum = 0; 644 switch (DefMI->getOpcode()) { 645 case AArch64::ADDSXri: 646 case AArch64::ADDSWri: 647 // if NZCV is used, do not fold. 648 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, 649 true) == -1) 650 return 0; 651 // fall-through to ADDXri and ADDWri. 652 [[fallthrough]]; 653 case AArch64::ADDXri: 654 case AArch64::ADDWri: 655 // add x, 1 -> csinc. 656 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 || 657 DefMI->getOperand(3).getImm() != 0) 658 return 0; 659 SrcOpNum = 1; 660 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr; 661 break; 662 663 case AArch64::ORNXrr: 664 case AArch64::ORNWrr: { 665 // not x -> csinv, represented as orn dst, xzr, src. 666 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); 667 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) 668 return 0; 669 SrcOpNum = 2; 670 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr; 671 break; 672 } 673 674 case AArch64::SUBSXrr: 675 case AArch64::SUBSWrr: 676 // if NZCV is used, do not fold. 677 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, 678 true) == -1) 679 return 0; 680 // fall-through to SUBXrr and SUBWrr. 681 [[fallthrough]]; 682 case AArch64::SUBXrr: 683 case AArch64::SUBWrr: { 684 // neg x -> csneg, represented as sub dst, xzr, src. 685 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); 686 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) 687 return 0; 688 SrcOpNum = 2; 689 Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr; 690 break; 691 } 692 default: 693 return 0; 694 } 695 assert(Opc && SrcOpNum && "Missing parameters"); 696 697 if (NewVReg) 698 *NewVReg = DefMI->getOperand(SrcOpNum).getReg(); 699 return Opc; 700 } 701 702 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 703 ArrayRef<MachineOperand> Cond, 704 Register DstReg, Register TrueReg, 705 Register FalseReg, int &CondCycles, 706 int &TrueCycles, 707 int &FalseCycles) const { 708 // Check register classes. 
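  // For reference, canFoldIntoCSel() above recognizes patterns such as
  // (illustrative virtual registers):
  //   %t = ADDWri %a, 1, 0       ; %t = %a + 1
  //   %d = CSELWr %t, %f, cc     ; %d = cc ? %t : %f
  // which insertSelect() below emits as a single
  //   %d = CSINCWr %f, %a, invert(cc)
  // so the folded operand costs no extra instruction here.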
709 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 710 const TargetRegisterClass *RC = 711 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); 712 if (!RC) 713 return false; 714 715 // Also need to check the dest regclass, in case we're trying to optimize 716 // something like: 717 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2 718 if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg))) 719 return false; 720 721 // Expanding cbz/tbz requires an extra cycle of latency on the condition. 722 unsigned ExtraCondLat = Cond.size() != 1; 723 724 // GPRs are handled by csel. 725 // FIXME: Fold in x+1, -x, and ~x when applicable. 726 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) || 727 AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 728 // Single-cycle csel, csinc, csinv, and csneg. 729 CondCycles = 1 + ExtraCondLat; 730 TrueCycles = FalseCycles = 1; 731 if (canFoldIntoCSel(MRI, TrueReg)) 732 TrueCycles = 0; 733 else if (canFoldIntoCSel(MRI, FalseReg)) 734 FalseCycles = 0; 735 return true; 736 } 737 738 // Scalar floating point is handled by fcsel. 739 // FIXME: Form fabs, fmin, and fmax when applicable. 740 if (AArch64::FPR64RegClass.hasSubClassEq(RC) || 741 AArch64::FPR32RegClass.hasSubClassEq(RC)) { 742 CondCycles = 5 + ExtraCondLat; 743 TrueCycles = FalseCycles = 2; 744 return true; 745 } 746 747 // Can't do vectors. 748 return false; 749 } 750 751 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, 752 MachineBasicBlock::iterator I, 753 const DebugLoc &DL, Register DstReg, 754 ArrayRef<MachineOperand> Cond, 755 Register TrueReg, Register FalseReg) const { 756 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 757 758 // Parse the condition code, see parseCondBranch() above. 759 AArch64CC::CondCode CC; 760 switch (Cond.size()) { 761 default: 762 llvm_unreachable("Unknown condition opcode in Cond"); 763 case 1: // b.cc 764 CC = AArch64CC::CondCode(Cond[0].getImm()); 765 break; 766 case 3: { // cbz/cbnz 767 // We must insert a compare against 0. 768 bool Is64Bit; 769 switch (Cond[1].getImm()) { 770 default: 771 llvm_unreachable("Unknown branch opcode in Cond"); 772 case AArch64::CBZW: 773 Is64Bit = false; 774 CC = AArch64CC::EQ; 775 break; 776 case AArch64::CBZX: 777 Is64Bit = true; 778 CC = AArch64CC::EQ; 779 break; 780 case AArch64::CBNZW: 781 Is64Bit = false; 782 CC = AArch64CC::NE; 783 break; 784 case AArch64::CBNZX: 785 Is64Bit = true; 786 CC = AArch64CC::NE; 787 break; 788 } 789 Register SrcReg = Cond[2].getReg(); 790 if (Is64Bit) { 791 // cmp reg, #0 is actually subs xzr, reg, #0. 792 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass); 793 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR) 794 .addReg(SrcReg) 795 .addImm(0) 796 .addImm(0); 797 } else { 798 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass); 799 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR) 800 .addReg(SrcReg) 801 .addImm(0) 802 .addImm(0); 803 } 804 break; 805 } 806 case 4: { // tbz/tbnz 807 // We must insert a tst instruction. 808 switch (Cond[1].getImm()) { 809 default: 810 llvm_unreachable("Unknown branch opcode in Cond"); 811 case AArch64::TBZW: 812 case AArch64::TBZX: 813 CC = AArch64CC::EQ; 814 break; 815 case AArch64::TBNZW: 816 case AArch64::TBNZX: 817 CC = AArch64CC::NE; 818 break; 819 } 820 // cmp reg, #foo is actually ands xzr, reg, #1<<foo. 
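    // e.g. a TBZW of bit 3 becomes "ands wzr, wN, #0x8" and the csel emitted
    // below then selects on EQ (bit clear) or NE (bit set).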
    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
      BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
    else
      BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
    break;
  }
  }

  unsigned Opc = 0;
  const TargetRegisterClass *RC = nullptr;
  bool TryFold = false;
  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
    RC = &AArch64::GPR64RegClass;
    Opc = AArch64::CSELXr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
    RC = &AArch64::GPR32RegClass;
    Opc = AArch64::CSELWr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FCSELDrrr;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
    RC = &AArch64::FPR32RegClass;
    Opc = AArch64::FCSELSrrr;
  }
  assert(RC && "Unsupported regclass");

  // Try folding simple instructions into the csel.
  if (TryFold) {
    unsigned NewVReg = 0;
    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
    if (FoldedOpc) {
      // The folded opcodes csinv, csinc and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
      CC = AArch64CC::getInvertedCondCode(CC);
      TrueReg = FalseReg;
    } else
      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);

    // Fold the operation. Leave any dead instructions for DCE to clean up.
    if (FoldedOpc) {
      FalseReg = NewVReg;
      Opc = FoldedOpc;
      // This extends the live range of NewVReg.
      MRI.clearKillFlags(NewVReg);
    }
  }

  // Pull all virtual registers into the appropriate class.
  MRI.constrainRegClass(TrueReg, RC);
  MRI.constrainRegClass(FalseReg, RC);

  // Insert the csel.
  BuildMI(MBB, I, DL, get(Opc), DstReg)
      .addReg(TrueReg)
      .addReg(FalseReg)
      .addImm(CC);
}

// Return true if Imm can be loaded into a register by a "cheap" sequence of
// instructions. For now, "cheap" means at most two instructions.
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
  if (BitSize == 32)
    return true;

  assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
  uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
  AArch64_IMM::expandMOVImm(Imm, BitSize, Is);

  return Is.size() <= 2;
}

// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
  if (Subtarget.hasExynosCheapAsMoveHandling()) {
    if (isExynosCheapAsMove(MI))
      return true;
    return MI.isAsCheapAsAMove();
  }

  switch (MI.getOpcode()) {
  default:
    return MI.isAsCheapAsAMove();

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::SUBWrs:
  case AArch64::SUBXrs:
    return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;

  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
  // ORRXri, it is as cheap as MOV.
  // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
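  // e.g. 0x0000ffff0000ffff is a valid logical immediate (one ORRXri) and
  // 0x0012d687 expands to MOVZ+MOVK, so both count as cheap, whereas a
  // constant needing MOVZ plus three MOVKs does not.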
923 case AArch64::MOVi32imm: 924 return isCheapImmediate(MI, 32); 925 case AArch64::MOVi64imm: 926 return isCheapImmediate(MI, 64); 927 } 928 } 929 930 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) { 931 switch (MI.getOpcode()) { 932 default: 933 return false; 934 935 case AArch64::ADDWrs: 936 case AArch64::ADDXrs: 937 case AArch64::ADDSWrs: 938 case AArch64::ADDSXrs: { 939 unsigned Imm = MI.getOperand(3).getImm(); 940 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 941 if (ShiftVal == 0) 942 return true; 943 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5; 944 } 945 946 case AArch64::ADDWrx: 947 case AArch64::ADDXrx: 948 case AArch64::ADDXrx64: 949 case AArch64::ADDSWrx: 950 case AArch64::ADDSXrx: 951 case AArch64::ADDSXrx64: { 952 unsigned Imm = MI.getOperand(3).getImm(); 953 switch (AArch64_AM::getArithExtendType(Imm)) { 954 default: 955 return false; 956 case AArch64_AM::UXTB: 957 case AArch64_AM::UXTH: 958 case AArch64_AM::UXTW: 959 case AArch64_AM::UXTX: 960 return AArch64_AM::getArithShiftValue(Imm) <= 4; 961 } 962 } 963 964 case AArch64::SUBWrs: 965 case AArch64::SUBSWrs: { 966 unsigned Imm = MI.getOperand(3).getImm(); 967 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 968 return ShiftVal == 0 || 969 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31); 970 } 971 972 case AArch64::SUBXrs: 973 case AArch64::SUBSXrs: { 974 unsigned Imm = MI.getOperand(3).getImm(); 975 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 976 return ShiftVal == 0 || 977 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63); 978 } 979 980 case AArch64::SUBWrx: 981 case AArch64::SUBXrx: 982 case AArch64::SUBXrx64: 983 case AArch64::SUBSWrx: 984 case AArch64::SUBSXrx: 985 case AArch64::SUBSXrx64: { 986 unsigned Imm = MI.getOperand(3).getImm(); 987 switch (AArch64_AM::getArithExtendType(Imm)) { 988 default: 989 return false; 990 case AArch64_AM::UXTB: 991 case AArch64_AM::UXTH: 992 case AArch64_AM::UXTW: 993 case AArch64_AM::UXTX: 994 return AArch64_AM::getArithShiftValue(Imm) == 0; 995 } 996 } 997 998 case AArch64::LDRBBroW: 999 case AArch64::LDRBBroX: 1000 case AArch64::LDRBroW: 1001 case AArch64::LDRBroX: 1002 case AArch64::LDRDroW: 1003 case AArch64::LDRDroX: 1004 case AArch64::LDRHHroW: 1005 case AArch64::LDRHHroX: 1006 case AArch64::LDRHroW: 1007 case AArch64::LDRHroX: 1008 case AArch64::LDRQroW: 1009 case AArch64::LDRQroX: 1010 case AArch64::LDRSBWroW: 1011 case AArch64::LDRSBWroX: 1012 case AArch64::LDRSBXroW: 1013 case AArch64::LDRSBXroX: 1014 case AArch64::LDRSHWroW: 1015 case AArch64::LDRSHWroX: 1016 case AArch64::LDRSHXroW: 1017 case AArch64::LDRSHXroX: 1018 case AArch64::LDRSWroW: 1019 case AArch64::LDRSWroX: 1020 case AArch64::LDRSroW: 1021 case AArch64::LDRSroX: 1022 case AArch64::LDRWroW: 1023 case AArch64::LDRWroX: 1024 case AArch64::LDRXroW: 1025 case AArch64::LDRXroX: 1026 case AArch64::PRFMroW: 1027 case AArch64::PRFMroX: 1028 case AArch64::STRBBroW: 1029 case AArch64::STRBBroX: 1030 case AArch64::STRBroW: 1031 case AArch64::STRBroX: 1032 case AArch64::STRDroW: 1033 case AArch64::STRDroX: 1034 case AArch64::STRHHroW: 1035 case AArch64::STRHHroX: 1036 case AArch64::STRHroW: 1037 case AArch64::STRHroX: 1038 case AArch64::STRQroW: 1039 case AArch64::STRQroX: 1040 case AArch64::STRSroW: 1041 case AArch64::STRSroX: 1042 case AArch64::STRWroW: 1043 case AArch64::STRWroX: 1044 case AArch64::STRXroW: 1045 case AArch64::STRXroX: { 1046 unsigned IsSigned = MI.getOperand(3).getImm(); 1047 return !IsSigned; 1048 } 
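  // Note: for the register-offset loads/stores above, operand 3 is the
  // "offset register is sign-extended" flag, so only the zero-extended
  // addressing forms are treated as fast.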
1049 } 1050 } 1051 1052 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) { 1053 unsigned Opc = MI.getOpcode(); 1054 switch (Opc) { 1055 default: 1056 return false; 1057 case AArch64::SEH_StackAlloc: 1058 case AArch64::SEH_SaveFPLR: 1059 case AArch64::SEH_SaveFPLR_X: 1060 case AArch64::SEH_SaveReg: 1061 case AArch64::SEH_SaveReg_X: 1062 case AArch64::SEH_SaveRegP: 1063 case AArch64::SEH_SaveRegP_X: 1064 case AArch64::SEH_SaveFReg: 1065 case AArch64::SEH_SaveFReg_X: 1066 case AArch64::SEH_SaveFRegP: 1067 case AArch64::SEH_SaveFRegP_X: 1068 case AArch64::SEH_SetFP: 1069 case AArch64::SEH_AddFP: 1070 case AArch64::SEH_Nop: 1071 case AArch64::SEH_PrologEnd: 1072 case AArch64::SEH_EpilogStart: 1073 case AArch64::SEH_EpilogEnd: 1074 case AArch64::SEH_PACSignLR: 1075 case AArch64::SEH_SaveAnyRegQP: 1076 case AArch64::SEH_SaveAnyRegQPX: 1077 return true; 1078 } 1079 } 1080 1081 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, 1082 Register &SrcReg, Register &DstReg, 1083 unsigned &SubIdx) const { 1084 switch (MI.getOpcode()) { 1085 default: 1086 return false; 1087 case AArch64::SBFMXri: // aka sxtw 1088 case AArch64::UBFMXri: // aka uxtw 1089 // Check for the 32 -> 64 bit extension case, these instructions can do 1090 // much more. 1091 if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31) 1092 return false; 1093 // This is a signed or unsigned 32 -> 64 bit extension. 1094 SrcReg = MI.getOperand(1).getReg(); 1095 DstReg = MI.getOperand(0).getReg(); 1096 SubIdx = AArch64::sub_32; 1097 return true; 1098 } 1099 } 1100 1101 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint( 1102 const MachineInstr &MIa, const MachineInstr &MIb) const { 1103 const TargetRegisterInfo *TRI = &getRegisterInfo(); 1104 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr; 1105 int64_t OffsetA = 0, OffsetB = 0; 1106 TypeSize WidthA(0, false), WidthB(0, false); 1107 bool OffsetAIsScalable = false, OffsetBIsScalable = false; 1108 1109 assert(MIa.mayLoadOrStore() && "MIa must be a load or store."); 1110 assert(MIb.mayLoadOrStore() && "MIb must be a load or store."); 1111 1112 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() || 1113 MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) 1114 return false; 1115 1116 // Retrieve the base, offset from the base and width. Width 1117 // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If 1118 // base are identical, and the offset of a lower memory access + 1119 // the width doesn't overlap the offset of a higher memory access, 1120 // then the memory accesses are different. 1121 // If OffsetAIsScalable and OffsetBIsScalable are both true, they 1122 // are assumed to have the same scale (vscale). 1123 if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable, 1124 WidthA, TRI) && 1125 getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable, 1126 WidthB, TRI)) { 1127 if (BaseOpA->isIdenticalTo(*BaseOpB) && 1128 OffsetAIsScalable == OffsetBIsScalable) { 1129 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; 1130 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; 1131 TypeSize LowWidth = (LowOffset == OffsetA) ? 
WidthA : WidthB; 1132 if (LowWidth.isScalable() == OffsetAIsScalable && 1133 LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset) 1134 return true; 1135 } 1136 } 1137 return false; 1138 } 1139 1140 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI, 1141 const MachineBasicBlock *MBB, 1142 const MachineFunction &MF) const { 1143 if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF)) 1144 return true; 1145 1146 // Do not move an instruction that can be recognized as a branch target. 1147 if (hasBTISemantics(MI)) 1148 return true; 1149 1150 switch (MI.getOpcode()) { 1151 case AArch64::HINT: 1152 // CSDB hints are scheduling barriers. 1153 if (MI.getOperand(0).getImm() == 0x14) 1154 return true; 1155 break; 1156 case AArch64::DSB: 1157 case AArch64::ISB: 1158 // DSB and ISB also are scheduling barriers. 1159 return true; 1160 case AArch64::MSRpstatesvcrImm1: 1161 // SMSTART and SMSTOP are also scheduling barriers. 1162 return true; 1163 default:; 1164 } 1165 if (isSEHInstruction(MI)) 1166 return true; 1167 auto Next = std::next(MI.getIterator()); 1168 return Next != MBB->end() && Next->isCFIInstruction(); 1169 } 1170 1171 /// analyzeCompare - For a comparison instruction, return the source registers 1172 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. 1173 /// Return true if the comparison instruction can be analyzed. 1174 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, 1175 Register &SrcReg2, int64_t &CmpMask, 1176 int64_t &CmpValue) const { 1177 // The first operand can be a frame index where we'd normally expect a 1178 // register. 1179 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands"); 1180 if (!MI.getOperand(1).isReg()) 1181 return false; 1182 1183 switch (MI.getOpcode()) { 1184 default: 1185 break; 1186 case AArch64::PTEST_PP: 1187 case AArch64::PTEST_PP_ANY: 1188 SrcReg = MI.getOperand(0).getReg(); 1189 SrcReg2 = MI.getOperand(1).getReg(); 1190 // Not sure about the mask and value for now... 1191 CmpMask = ~0; 1192 CmpValue = 0; 1193 return true; 1194 case AArch64::SUBSWrr: 1195 case AArch64::SUBSWrs: 1196 case AArch64::SUBSWrx: 1197 case AArch64::SUBSXrr: 1198 case AArch64::SUBSXrs: 1199 case AArch64::SUBSXrx: 1200 case AArch64::ADDSWrr: 1201 case AArch64::ADDSWrs: 1202 case AArch64::ADDSWrx: 1203 case AArch64::ADDSXrr: 1204 case AArch64::ADDSXrs: 1205 case AArch64::ADDSXrx: 1206 // Replace SUBSWrr with SUBWrr if NZCV is not used. 1207 SrcReg = MI.getOperand(1).getReg(); 1208 SrcReg2 = MI.getOperand(2).getReg(); 1209 CmpMask = ~0; 1210 CmpValue = 0; 1211 return true; 1212 case AArch64::SUBSWri: 1213 case AArch64::ADDSWri: 1214 case AArch64::SUBSXri: 1215 case AArch64::ADDSXri: 1216 SrcReg = MI.getOperand(1).getReg(); 1217 SrcReg2 = 0; 1218 CmpMask = ~0; 1219 CmpValue = MI.getOperand(2).getImm(); 1220 return true; 1221 case AArch64::ANDSWri: 1222 case AArch64::ANDSXri: 1223 // ANDS does not use the same encoding scheme as the others xxxS 1224 // instructions. 1225 SrcReg = MI.getOperand(1).getReg(); 1226 SrcReg2 = 0; 1227 CmpMask = ~0; 1228 CmpValue = AArch64_AM::decodeLogicalImmediate( 1229 MI.getOperand(2).getImm(), 1230 MI.getOpcode() == AArch64::ANDSWri ? 
32 : 64); 1231 return true; 1232 } 1233 1234 return false; 1235 } 1236 1237 static bool UpdateOperandRegClass(MachineInstr &Instr) { 1238 MachineBasicBlock *MBB = Instr.getParent(); 1239 assert(MBB && "Can't get MachineBasicBlock here"); 1240 MachineFunction *MF = MBB->getParent(); 1241 assert(MF && "Can't get MachineFunction here"); 1242 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); 1243 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); 1244 MachineRegisterInfo *MRI = &MF->getRegInfo(); 1245 1246 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx; 1247 ++OpIdx) { 1248 MachineOperand &MO = Instr.getOperand(OpIdx); 1249 const TargetRegisterClass *OpRegCstraints = 1250 Instr.getRegClassConstraint(OpIdx, TII, TRI); 1251 1252 // If there's no constraint, there's nothing to do. 1253 if (!OpRegCstraints) 1254 continue; 1255 // If the operand is a frame index, there's nothing to do here. 1256 // A frame index operand will resolve correctly during PEI. 1257 if (MO.isFI()) 1258 continue; 1259 1260 assert(MO.isReg() && 1261 "Operand has register constraints without being a register!"); 1262 1263 Register Reg = MO.getReg(); 1264 if (Reg.isPhysical()) { 1265 if (!OpRegCstraints->contains(Reg)) 1266 return false; 1267 } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) && 1268 !MRI->constrainRegClass(Reg, OpRegCstraints)) 1269 return false; 1270 } 1271 1272 return true; 1273 } 1274 1275 /// Return the opcode that does not set flags when possible - otherwise 1276 /// return the original opcode. The caller is responsible to do the actual 1277 /// substitution and legality checking. 1278 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) { 1279 // Don't convert all compare instructions, because for some the zero register 1280 // encoding becomes the sp register. 1281 bool MIDefinesZeroReg = false; 1282 if (MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) || 1283 MI.definesRegister(AArch64::XZR, /*TRI=*/nullptr)) 1284 MIDefinesZeroReg = true; 1285 1286 switch (MI.getOpcode()) { 1287 default: 1288 return MI.getOpcode(); 1289 case AArch64::ADDSWrr: 1290 return AArch64::ADDWrr; 1291 case AArch64::ADDSWri: 1292 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri; 1293 case AArch64::ADDSWrs: 1294 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs; 1295 case AArch64::ADDSWrx: 1296 return AArch64::ADDWrx; 1297 case AArch64::ADDSXrr: 1298 return AArch64::ADDXrr; 1299 case AArch64::ADDSXri: 1300 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri; 1301 case AArch64::ADDSXrs: 1302 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs; 1303 case AArch64::ADDSXrx: 1304 return AArch64::ADDXrx; 1305 case AArch64::SUBSWrr: 1306 return AArch64::SUBWrr; 1307 case AArch64::SUBSWri: 1308 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri; 1309 case AArch64::SUBSWrs: 1310 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs; 1311 case AArch64::SUBSWrx: 1312 return AArch64::SUBWrx; 1313 case AArch64::SUBSXrr: 1314 return AArch64::SUBXrr; 1315 case AArch64::SUBSXri: 1316 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri; 1317 case AArch64::SUBSXrs: 1318 return MIDefinesZeroReg ? 
AArch64::SUBSXrs : AArch64::SUBXrs; 1319 case AArch64::SUBSXrx: 1320 return AArch64::SUBXrx; 1321 } 1322 } 1323 1324 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 }; 1325 1326 /// True when condition flags are accessed (either by writing or reading) 1327 /// on the instruction trace starting at From and ending at To. 1328 /// 1329 /// Note: If From and To are from different blocks it's assumed CC are accessed 1330 /// on the path. 1331 static bool areCFlagsAccessedBetweenInstrs( 1332 MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, 1333 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) { 1334 // Early exit if To is at the beginning of the BB. 1335 if (To == To->getParent()->begin()) 1336 return true; 1337 1338 // Check whether the instructions are in the same basic block 1339 // If not, assume the condition flags might get modified somewhere. 1340 if (To->getParent() != From->getParent()) 1341 return true; 1342 1343 // From must be above To. 1344 assert(std::any_of( 1345 ++To.getReverse(), To->getParent()->rend(), 1346 [From](MachineInstr &MI) { return MI.getIterator() == From; })); 1347 1348 // We iterate backward starting at \p To until we hit \p From. 1349 for (const MachineInstr &Instr : 1350 instructionsWithoutDebug(++To.getReverse(), From.getReverse())) { 1351 if (((AccessToCheck & AK_Write) && 1352 Instr.modifiesRegister(AArch64::NZCV, TRI)) || 1353 ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI))) 1354 return true; 1355 } 1356 return false; 1357 } 1358 1359 std::optional<unsigned> 1360 AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask, 1361 MachineInstr *Pred, 1362 const MachineRegisterInfo *MRI) const { 1363 unsigned MaskOpcode = Mask->getOpcode(); 1364 unsigned PredOpcode = Pred->getOpcode(); 1365 bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode); 1366 bool PredIsWhileLike = isWhileOpcode(PredOpcode); 1367 1368 if (PredIsWhileLike) { 1369 // For PTEST(PG, PG), PTEST is redundant when PG is the result of a WHILEcc 1370 // instruction and the condition is "any" since WHILcc does an implicit 1371 // PTEST(ALL, PG) check and PG is always a subset of ALL. 1372 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY) 1373 return PredOpcode; 1374 1375 // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is 1376 // redundant since WHILE performs an implicit PTEST with an all active 1377 // mask. 1378 if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 && 1379 getElementSizeForOpcode(MaskOpcode) == 1380 getElementSizeForOpcode(PredOpcode)) 1381 return PredOpcode; 1382 1383 return {}; 1384 } 1385 1386 if (PredIsPTestLike) { 1387 // For PTEST(PG, PG), PTEST is redundant when PG is the result of an 1388 // instruction that sets the flags as PTEST would and the condition is 1389 // "any" since PG is always a subset of the governing predicate of the 1390 // ptest-like instruction. 1391 if ((Mask == Pred) && PTest->getOpcode() == AArch64::PTEST_PP_ANY) 1392 return PredOpcode; 1393 1394 // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the 1395 // the element size matches and either the PTEST_LIKE instruction uses 1396 // the same all active mask or the condition is "any". 
1397 if (isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31 && 1398 getElementSizeForOpcode(MaskOpcode) == 1399 getElementSizeForOpcode(PredOpcode)) { 1400 auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); 1401 if (Mask == PTestLikeMask || PTest->getOpcode() == AArch64::PTEST_PP_ANY) 1402 return PredOpcode; 1403 } 1404 1405 // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the 1406 // flags are set based on the same mask 'PG', but PTEST_LIKE must operate 1407 // on 8-bit predicates like the PTEST. Otherwise, for instructions like 1408 // compare that also support 16/32/64-bit predicates, the implicit PTEST 1409 // performed by the compare could consider fewer lanes for these element 1410 // sizes. 1411 // 1412 // For example, consider 1413 // 1414 // ptrue p0.b ; P0=1111-1111-1111-1111 1415 // index z0.s, #0, #1 ; Z0=<0,1,2,3> 1416 // index z1.s, #1, #1 ; Z1=<1,2,3,4> 1417 // cmphi p1.s, p0/z, z1.s, z0.s ; P1=0001-0001-0001-0001 1418 // ; ^ last active 1419 // ptest p0, p1.b ; P1=0001-0001-0001-0001 1420 // ; ^ last active 1421 // 1422 // where the compare generates a canonical all active 32-bit predicate 1423 // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last 1424 // active flag, whereas the PTEST instruction with the same mask doesn't. 1425 // For PTEST_ANY this doesn't apply as the flags in this case would be 1426 // identical regardless of element size. 1427 auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); 1428 uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode); 1429 if (Mask == PTestLikeMask && (PredElementSize == AArch64::ElementSizeB || 1430 PTest->getOpcode() == AArch64::PTEST_PP_ANY)) 1431 return PredOpcode; 1432 1433 return {}; 1434 } 1435 1436 // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the 1437 // opcode so the PTEST becomes redundant. 1438 switch (PredOpcode) { 1439 case AArch64::AND_PPzPP: 1440 case AArch64::BIC_PPzPP: 1441 case AArch64::EOR_PPzPP: 1442 case AArch64::NAND_PPzPP: 1443 case AArch64::NOR_PPzPP: 1444 case AArch64::ORN_PPzPP: 1445 case AArch64::ORR_PPzPP: 1446 case AArch64::BRKA_PPzP: 1447 case AArch64::BRKPA_PPzPP: 1448 case AArch64::BRKB_PPzP: 1449 case AArch64::BRKPB_PPzPP: 1450 case AArch64::RDFFR_PPz: { 1451 // Check to see if our mask is the same. If not the resulting flag bits 1452 // may be different and we can't remove the ptest. 1453 auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); 1454 if (Mask != PredMask) 1455 return {}; 1456 break; 1457 } 1458 case AArch64::BRKN_PPzP: { 1459 // BRKN uses an all active implicit mask to set flags unlike the other 1460 // flag-setting instructions. 1461 // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B). 
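    // (Unlike the cases above, matching the governing predicate alone is not
    // sufficient: BRKNS sets the flags as if tested against an all-active
    // mask, so the external PTEST mask must itself be an all-lane PTRUE_B.)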
1462 if ((MaskOpcode != AArch64::PTRUE_B) || 1463 (Mask->getOperand(1).getImm() != 31)) 1464 return {}; 1465 break; 1466 } 1467 case AArch64::PTRUE_B: 1468 // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A) 1469 break; 1470 default: 1471 // Bail out if we don't recognize the input 1472 return {}; 1473 } 1474 1475 return convertToFlagSettingOpc(PredOpcode); 1476 } 1477 1478 /// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating 1479 /// operation which could set the flags in an identical manner 1480 bool AArch64InstrInfo::optimizePTestInstr( 1481 MachineInstr *PTest, unsigned MaskReg, unsigned PredReg, 1482 const MachineRegisterInfo *MRI) const { 1483 auto *Mask = MRI->getUniqueVRegDef(MaskReg); 1484 auto *Pred = MRI->getUniqueVRegDef(PredReg); 1485 unsigned PredOpcode = Pred->getOpcode(); 1486 auto NewOp = canRemovePTestInstr(PTest, Mask, Pred, MRI); 1487 if (!NewOp) 1488 return false; 1489 1490 const TargetRegisterInfo *TRI = &getRegisterInfo(); 1491 1492 // If another instruction between Pred and PTest accesses flags, don't remove 1493 // the ptest or update the earlier instruction to modify them. 1494 if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI)) 1495 return false; 1496 1497 // If we pass all the checks, it's safe to remove the PTEST and use the flags 1498 // as they are prior to PTEST. Sometimes this requires the tested PTEST 1499 // operand to be replaced with an equivalent instruction that also sets the 1500 // flags. 1501 PTest->eraseFromParent(); 1502 if (*NewOp != PredOpcode) { 1503 Pred->setDesc(get(*NewOp)); 1504 bool succeeded = UpdateOperandRegClass(*Pred); 1505 (void)succeeded; 1506 assert(succeeded && "Operands have incompatible register classes!"); 1507 Pred->addRegisterDefined(AArch64::NZCV, TRI); 1508 } 1509 1510 // Ensure that the flags def is live. 1511 if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) { 1512 unsigned i = 0, e = Pred->getNumOperands(); 1513 for (; i != e; ++i) { 1514 MachineOperand &MO = Pred->getOperand(i); 1515 if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) { 1516 MO.setIsDead(false); 1517 break; 1518 } 1519 } 1520 } 1521 return true; 1522 } 1523 1524 /// Try to optimize a compare instruction. A compare instruction is an 1525 /// instruction which produces AArch64::NZCV. It can be truly compare 1526 /// instruction 1527 /// when there are no uses of its destination register. 1528 /// 1529 /// The following steps are tried in order: 1530 /// 1. Convert CmpInstr into an unconditional version. 1531 /// 2. Remove CmpInstr if above there is an instruction producing a needed 1532 /// condition code or an instruction which can be converted into such an 1533 /// instruction. 1534 /// Only comparison with zero is supported. 1535 bool AArch64InstrInfo::optimizeCompareInstr( 1536 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, 1537 int64_t CmpValue, const MachineRegisterInfo *MRI) const { 1538 assert(CmpInstr.getParent()); 1539 assert(MRI); 1540 1541 // Replace SUBSWrr with SUBWrr if NZCV is not used. 
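  // e.g. (illustrative MIR)
  //   %3:gpr32 = SUBSWrr %0, %1, implicit-def dead $nzcv
  // is rewritten below to
  //   %3:gpr32 = SUBWrr %0, %1
  // once the dead flag definition is dropped.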
1542 int DeadNZCVIdx = 1543 CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true); 1544 if (DeadNZCVIdx != -1) { 1545 if (CmpInstr.definesRegister(AArch64::WZR, /*TRI=*/nullptr) || 1546 CmpInstr.definesRegister(AArch64::XZR, /*TRI=*/nullptr)) { 1547 CmpInstr.eraseFromParent(); 1548 return true; 1549 } 1550 unsigned Opc = CmpInstr.getOpcode(); 1551 unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr); 1552 if (NewOpc == Opc) 1553 return false; 1554 const MCInstrDesc &MCID = get(NewOpc); 1555 CmpInstr.setDesc(MCID); 1556 CmpInstr.removeOperand(DeadNZCVIdx); 1557 bool succeeded = UpdateOperandRegClass(CmpInstr); 1558 (void)succeeded; 1559 assert(succeeded && "Some operands reg class are incompatible!"); 1560 return true; 1561 } 1562 1563 if (CmpInstr.getOpcode() == AArch64::PTEST_PP || 1564 CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY) 1565 return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI); 1566 1567 if (SrcReg2 != 0) 1568 return false; 1569 1570 // CmpInstr is a Compare instruction if destination register is not used. 1571 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg())) 1572 return false; 1573 1574 if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI)) 1575 return true; 1576 return (CmpValue == 0 || CmpValue == 1) && 1577 removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI); 1578 } 1579 1580 /// Get opcode of S version of Instr. 1581 /// If Instr is S version its opcode is returned. 1582 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version 1583 /// or we are not interested in it. 1584 static unsigned sForm(MachineInstr &Instr) { 1585 switch (Instr.getOpcode()) { 1586 default: 1587 return AArch64::INSTRUCTION_LIST_END; 1588 1589 case AArch64::ADDSWrr: 1590 case AArch64::ADDSWri: 1591 case AArch64::ADDSXrr: 1592 case AArch64::ADDSXri: 1593 case AArch64::SUBSWrr: 1594 case AArch64::SUBSWri: 1595 case AArch64::SUBSXrr: 1596 case AArch64::SUBSXri: 1597 return Instr.getOpcode(); 1598 1599 case AArch64::ADDWrr: 1600 return AArch64::ADDSWrr; 1601 case AArch64::ADDWri: 1602 return AArch64::ADDSWri; 1603 case AArch64::ADDXrr: 1604 return AArch64::ADDSXrr; 1605 case AArch64::ADDXri: 1606 return AArch64::ADDSXri; 1607 case AArch64::ADCWr: 1608 return AArch64::ADCSWr; 1609 case AArch64::ADCXr: 1610 return AArch64::ADCSXr; 1611 case AArch64::SUBWrr: 1612 return AArch64::SUBSWrr; 1613 case AArch64::SUBWri: 1614 return AArch64::SUBSWri; 1615 case AArch64::SUBXrr: 1616 return AArch64::SUBSXrr; 1617 case AArch64::SUBXri: 1618 return AArch64::SUBSXri; 1619 case AArch64::SBCWr: 1620 return AArch64::SBCSWr; 1621 case AArch64::SBCXr: 1622 return AArch64::SBCSXr; 1623 case AArch64::ANDWri: 1624 return AArch64::ANDSWri; 1625 case AArch64::ANDXri: 1626 return AArch64::ANDSXri; 1627 } 1628 } 1629 1630 /// Check if AArch64::NZCV should be alive in successors of MBB. 1631 static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) { 1632 for (auto *BB : MBB->successors()) 1633 if (BB->isLiveIn(AArch64::NZCV)) 1634 return true; 1635 return false; 1636 } 1637 1638 /// \returns The condition code operand index for \p Instr if it is a branch 1639 /// or select and -1 otherwise. 
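///
/// The index is derived from the implicit NZCV use: the condition immediate
/// sits two operands before it for Bcc and one operand before it for the
/// csel/fcsel family.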
1640 static int 1641 findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) { 1642 switch (Instr.getOpcode()) { 1643 default: 1644 return -1; 1645 1646 case AArch64::Bcc: { 1647 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr); 1648 assert(Idx >= 2); 1649 return Idx - 2; 1650 } 1651 1652 case AArch64::CSINVWr: 1653 case AArch64::CSINVXr: 1654 case AArch64::CSINCWr: 1655 case AArch64::CSINCXr: 1656 case AArch64::CSELWr: 1657 case AArch64::CSELXr: 1658 case AArch64::CSNEGWr: 1659 case AArch64::CSNEGXr: 1660 case AArch64::FCSELSrrr: 1661 case AArch64::FCSELDrrr: { 1662 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr); 1663 assert(Idx >= 1); 1664 return Idx - 1; 1665 } 1666 } 1667 } 1668 1669 /// Find a condition code used by the instruction. 1670 /// Returns AArch64CC::Invalid if either the instruction does not use condition 1671 /// codes or we don't optimize CmpInstr in the presence of such instructions. 1672 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) { 1673 int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr); 1674 return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>( 1675 Instr.getOperand(CCIdx).getImm()) 1676 : AArch64CC::Invalid; 1677 } 1678 1679 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) { 1680 assert(CC != AArch64CC::Invalid); 1681 UsedNZCV UsedFlags; 1682 switch (CC) { 1683 default: 1684 break; 1685 1686 case AArch64CC::EQ: // Z set 1687 case AArch64CC::NE: // Z clear 1688 UsedFlags.Z = true; 1689 break; 1690 1691 case AArch64CC::HI: // Z clear and C set 1692 case AArch64CC::LS: // Z set or C clear 1693 UsedFlags.Z = true; 1694 [[fallthrough]]; 1695 case AArch64CC::HS: // C set 1696 case AArch64CC::LO: // C clear 1697 UsedFlags.C = true; 1698 break; 1699 1700 case AArch64CC::MI: // N set 1701 case AArch64CC::PL: // N clear 1702 UsedFlags.N = true; 1703 break; 1704 1705 case AArch64CC::VS: // V set 1706 case AArch64CC::VC: // V clear 1707 UsedFlags.V = true; 1708 break; 1709 1710 case AArch64CC::GT: // Z clear, N and V the same 1711 case AArch64CC::LE: // Z set, N and V differ 1712 UsedFlags.Z = true; 1713 [[fallthrough]]; 1714 case AArch64CC::GE: // N and V the same 1715 case AArch64CC::LT: // N and V differ 1716 UsedFlags.N = true; 1717 UsedFlags.V = true; 1718 break; 1719 } 1720 return UsedFlags; 1721 } 1722 1723 /// \returns Conditions flags used after \p CmpInstr in its MachineBB if NZCV 1724 /// flags are not alive in successors of the same \p CmpInstr and \p MI parent. 1725 /// \returns std::nullopt otherwise. 1726 /// 1727 /// Collect instructions using that flags in \p CCUseInstrs if provided. 
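///
/// The scan walks forward from \p CmpInstr and stops at the first instruction
/// that redefines NZCV.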
1728 std::optional<UsedNZCV> 1729 llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, 1730 const TargetRegisterInfo &TRI, 1731 SmallVectorImpl<MachineInstr *> *CCUseInstrs) { 1732 MachineBasicBlock *CmpParent = CmpInstr.getParent(); 1733 if (MI.getParent() != CmpParent) 1734 return std::nullopt; 1735 1736 if (areCFlagsAliveInSuccessors(CmpParent)) 1737 return std::nullopt; 1738 1739 UsedNZCV NZCVUsedAfterCmp; 1740 for (MachineInstr &Instr : instructionsWithoutDebug( 1741 std::next(CmpInstr.getIterator()), CmpParent->instr_end())) { 1742 if (Instr.readsRegister(AArch64::NZCV, &TRI)) { 1743 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr); 1744 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction 1745 return std::nullopt; 1746 NZCVUsedAfterCmp |= getUsedNZCV(CC); 1747 if (CCUseInstrs) 1748 CCUseInstrs->push_back(&Instr); 1749 } 1750 if (Instr.modifiesRegister(AArch64::NZCV, &TRI)) 1751 break; 1752 } 1753 return NZCVUsedAfterCmp; 1754 } 1755 1756 static bool isADDSRegImm(unsigned Opcode) { 1757 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri; 1758 } 1759 1760 static bool isSUBSRegImm(unsigned Opcode) { 1761 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri; 1762 } 1763 1764 /// Check if CmpInstr can be substituted by MI. 1765 /// 1766 /// CmpInstr can be substituted: 1767 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0' 1768 /// - and, MI and CmpInstr are from the same MachineBB 1769 /// - and, condition flags are not alive in successors of the CmpInstr parent 1770 /// - and, if MI opcode is the S form there must be no defs of flags between 1771 /// MI and CmpInstr 1772 /// or if MI opcode is not the S form there must be neither defs of flags 1773 /// nor uses of flags between MI and CmpInstr. 1774 /// - and, if C/V flags are not used after CmpInstr 1775 /// or if N flag is used but MI produces poison value if signed overflow 1776 /// occurs. 1777 static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, 1778 const TargetRegisterInfo &TRI) { 1779 // NOTE this assertion guarantees that MI.getOpcode() is add or subtraction 1780 // that may or may not set flags. 1781 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END); 1782 1783 const unsigned CmpOpcode = CmpInstr.getOpcode(); 1784 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode)) 1785 return false; 1786 1787 assert((CmpInstr.getOperand(2).isImm() && 1788 CmpInstr.getOperand(2).getImm() == 0) && 1789 "Caller guarantees that CmpInstr compares with constant 0"); 1790 1791 std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI); 1792 if (!NZVCUsed || NZVCUsed->C) 1793 return false; 1794 1795 // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either 1796 // '%vreg = add ...' or '%vreg = sub ...'. 1797 // Condition flag V is used to indicate signed overflow. 1798 // 1) MI and CmpInstr set N and V to the same value. 1799 // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when 1800 // signed overflow occurs, so CmpInstr could still be simplified away. 1801 if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap)) 1802 return false; 1803 1804 AccessKind AccessToCheck = AK_Write; 1805 if (sForm(MI) != MI.getOpcode()) 1806 AccessToCheck = AK_All; 1807 return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck); 1808 } 1809 1810 /// Substitute an instruction comparing to zero with another instruction 1811 /// which produces needed condition flags. 
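///
/// Illustrative sketch of the rewrite (assembly shown for clarity):
/// \code
///   sub  w1, w2, #1
///   cmp  w1, #0            ; subs wzr, w1, #0
///   b.eq <bb>
/// \endcode
/// becomes
/// \code
///   subs w1, w2, #1
///   b.eq <bb>
/// \endcode
/// provided the flags consumed after the compare are compatible (see
/// canInstrSubstituteCmpInstr).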
1812 /// 1813 /// Return true on success. 1814 bool AArch64InstrInfo::substituteCmpToZero( 1815 MachineInstr &CmpInstr, unsigned SrcReg, 1816 const MachineRegisterInfo &MRI) const { 1817 // Get the unique definition of SrcReg. 1818 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg); 1819 if (!MI) 1820 return false; 1821 1822 const TargetRegisterInfo &TRI = getRegisterInfo(); 1823 1824 unsigned NewOpc = sForm(*MI); 1825 if (NewOpc == AArch64::INSTRUCTION_LIST_END) 1826 return false; 1827 1828 if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI)) 1829 return false; 1830 1831 // Update the instruction to set NZCV. 1832 MI->setDesc(get(NewOpc)); 1833 CmpInstr.eraseFromParent(); 1834 bool succeeded = UpdateOperandRegClass(*MI); 1835 (void)succeeded; 1836 assert(succeeded && "Some operands reg class are incompatible!"); 1837 MI->addRegisterDefined(AArch64::NZCV, &TRI); 1838 return true; 1839 } 1840 1841 /// \returns True if \p CmpInstr can be removed. 1842 /// 1843 /// \p IsInvertCC is true if, after removing \p CmpInstr, condition 1844 /// codes used in \p CCUseInstrs must be inverted. 1845 static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr, 1846 int CmpValue, const TargetRegisterInfo &TRI, 1847 SmallVectorImpl<MachineInstr *> &CCUseInstrs, 1848 bool &IsInvertCC) { 1849 assert((CmpValue == 0 || CmpValue == 1) && 1850 "Only comparisons to 0 or 1 considered for removal!"); 1851 1852 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>' 1853 unsigned MIOpc = MI.getOpcode(); 1854 if (MIOpc == AArch64::CSINCWr) { 1855 if (MI.getOperand(1).getReg() != AArch64::WZR || 1856 MI.getOperand(2).getReg() != AArch64::WZR) 1857 return false; 1858 } else if (MIOpc == AArch64::CSINCXr) { 1859 if (MI.getOperand(1).getReg() != AArch64::XZR || 1860 MI.getOperand(2).getReg() != AArch64::XZR) 1861 return false; 1862 } else { 1863 return false; 1864 } 1865 AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI); 1866 if (MICC == AArch64CC::Invalid) 1867 return false; 1868 1869 // NZCV needs to be defined 1870 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) != -1) 1871 return false; 1872 1873 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1' 1874 const unsigned CmpOpcode = CmpInstr.getOpcode(); 1875 bool IsSubsRegImm = isSUBSRegImm(CmpOpcode); 1876 if (CmpValue && !IsSubsRegImm) 1877 return false; 1878 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode)) 1879 return false; 1880 1881 // MI conditions allowed: eq, ne, mi, pl 1882 UsedNZCV MIUsedNZCV = getUsedNZCV(MICC); 1883 if (MIUsedNZCV.C || MIUsedNZCV.V) 1884 return false; 1885 1886 std::optional<UsedNZCV> NZCVUsedAfterCmp = 1887 examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs); 1888 // Condition flags are not used in CmpInstr basic block successors and only 1889 // Z or N flags allowed to be used after CmpInstr within its basic block 1890 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V) 1891 return false; 1892 // Z or N flag used after CmpInstr must correspond to the flag used in MI 1893 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) || 1894 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z)) 1895 return false; 1896 // If CmpInstr is comparison to zero MI conditions are limited to eq, ne 1897 if (MIUsedNZCV.N && !CmpValue) 1898 return false; 1899 1900 // There must be no defs of flags between MI and CmpInstr 1901 if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write)) 1902 return false; 1903 1904 // Condition code is inverted in the following cases: 1905 // 1. 
MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' 1906 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1' 1907 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) || 1908 (!CmpValue && MICC == AArch64CC::NE); 1909 return true; 1910 } 1911 1912 /// Remove comparison in csinc-cmp sequence 1913 /// 1914 /// Examples: 1915 /// 1. \code 1916 /// csinc w9, wzr, wzr, ne 1917 /// cmp w9, #0 1918 /// b.eq 1919 /// \endcode 1920 /// to 1921 /// \code 1922 /// csinc w9, wzr, wzr, ne 1923 /// b.ne 1924 /// \endcode 1925 /// 1926 /// 2. \code 1927 /// csinc x2, xzr, xzr, mi 1928 /// cmp x2, #1 1929 /// b.pl 1930 /// \endcode 1931 /// to 1932 /// \code 1933 /// csinc x2, xzr, xzr, mi 1934 /// b.pl 1935 /// \endcode 1936 /// 1937 /// \param CmpInstr comparison instruction 1938 /// \return True when comparison removed 1939 bool AArch64InstrInfo::removeCmpToZeroOrOne( 1940 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue, 1941 const MachineRegisterInfo &MRI) const { 1942 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg); 1943 if (!MI) 1944 return false; 1945 const TargetRegisterInfo &TRI = getRegisterInfo(); 1946 SmallVector<MachineInstr *, 4> CCUseInstrs; 1947 bool IsInvertCC = false; 1948 if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs, 1949 IsInvertCC)) 1950 return false; 1951 // Make transformation 1952 CmpInstr.eraseFromParent(); 1953 if (IsInvertCC) { 1954 // Invert condition codes in CmpInstr CC users 1955 for (MachineInstr *CCUseInstr : CCUseInstrs) { 1956 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr); 1957 assert(Idx >= 0 && "Unexpected instruction using CC."); 1958 MachineOperand &CCOperand = CCUseInstr->getOperand(Idx); 1959 AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode( 1960 static_cast<AArch64CC::CondCode>(CCOperand.getImm())); 1961 CCOperand.setImm(CCUse); 1962 } 1963 } 1964 return true; 1965 } 1966 1967 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { 1968 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD && 1969 MI.getOpcode() != AArch64::CATCHRET) 1970 return false; 1971 1972 MachineBasicBlock &MBB = *MI.getParent(); 1973 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>(); 1974 auto TRI = Subtarget.getRegisterInfo(); 1975 DebugLoc DL = MI.getDebugLoc(); 1976 1977 if (MI.getOpcode() == AArch64::CATCHRET) { 1978 // Skip to the first instruction before the epilog. 
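    // Illustrative note: the code below materializes the address of the
    // catchret target block into X0 with an ADRP + ADDXri pair placed just
    // ahead of the epilogue's FrameDestroy (SEH) instructions, roughly:
    //   adrp x0, <target-bb>
    //   add  x0, x0, :lo12:<target-bb>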
    const TargetInstrInfo *TII =
        MBB.getParent()->getSubtarget().getInstrInfo();
    MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
    auto MBBI = MachineBasicBlock::iterator(MI);
    MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
    while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
           FirstEpilogSEH != MBB.begin())
      FirstEpilogSEH = std::prev(FirstEpilogSEH);
    if (FirstEpilogSEH != MBB.begin())
      FirstEpilogSEH = std::next(FirstEpilogSEH);
    BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
        .addReg(AArch64::X0, RegState::Define)
        .addMBB(TargetMBB);
    BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
        .addReg(AArch64::X0, RegState::Define)
        .addReg(AArch64::X0)
        .addMBB(TargetMBB)
        .addImm(0);
    return true;
  }

  Register Reg = MI.getOperand(0).getReg();
  Module &M = *MBB.getParent()->getFunction().getParent();
  if (M.getStackProtectorGuard() == "sysreg") {
    const AArch64SysReg::SysReg *SrcReg =
        AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
    if (!SrcReg)
      report_fatal_error("Unknown SysReg for Stack Protector Guard Register");

    // mrs xN, sysreg
    BuildMI(MBB, MI, DL, get(AArch64::MRS))
        .addDef(Reg, RegState::Renamable)
        .addImm(SrcReg->Encoding);
    int Offset = M.getStackProtectorGuardOffset();
    if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
      // ldr xN, [xN, #offset]
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(Offset / 8);
    } else if (Offset >= -256 && Offset <= 255) {
      // ldur xN, [xN, #offset]
      BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(Offset);
    } else if (Offset >= -4095 && Offset <= 4095) {
      if (Offset > 0) {
        // add xN, xN, #offset
        BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
            .addDef(Reg)
            .addUse(Reg, RegState::Kill)
            .addImm(Offset)
            .addImm(0);
      } else {
        // sub xN, xN, #offset
        BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
            .addDef(Reg)
            .addUse(Reg, RegState::Kill)
            .addImm(-Offset)
            .addImm(0);
      }
      // ldr xN, [xN]
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(0);
    } else {
      // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
      // than 32760.
      // It might be nice to use AArch64::MOVi32imm here, which would get
      // expanded in PreSched2 after PostRA, but our lone scratch Reg already
      // contains the MRS result. findScratchNonCalleeSaveRegister() in
      // AArch64FrameLowering might help us find such a scratch register
      // though. If we failed to find a scratch register, we could emit a
      // stream of add instructions to build up the immediate. Or, we could try
      // to insert an AArch64::MOVi32imm before register allocation so that we
      // didn't need to scavenge for a scratch register.
2057 report_fatal_error("Unable to encode Stack Protector Guard Offset"); 2058 } 2059 MBB.erase(MI); 2060 return true; 2061 } 2062 2063 const GlobalValue *GV = 2064 cast<GlobalValue>((*MI.memoperands_begin())->getValue()); 2065 const TargetMachine &TM = MBB.getParent()->getTarget(); 2066 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); 2067 const unsigned char MO_NC = AArch64II::MO_NC; 2068 2069 if ((OpFlags & AArch64II::MO_GOT) != 0) { 2070 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg) 2071 .addGlobalAddress(GV, 0, OpFlags); 2072 if (Subtarget.isTargetILP32()) { 2073 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 2074 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 2075 .addDef(Reg32, RegState::Dead) 2076 .addUse(Reg, RegState::Kill) 2077 .addImm(0) 2078 .addMemOperand(*MI.memoperands_begin()) 2079 .addDef(Reg, RegState::Implicit); 2080 } else { 2081 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 2082 .addReg(Reg, RegState::Kill) 2083 .addImm(0) 2084 .addMemOperand(*MI.memoperands_begin()); 2085 } 2086 } else if (TM.getCodeModel() == CodeModel::Large) { 2087 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?"); 2088 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) 2089 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC) 2090 .addImm(0); 2091 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2092 .addReg(Reg, RegState::Kill) 2093 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC) 2094 .addImm(16); 2095 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2096 .addReg(Reg, RegState::Kill) 2097 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC) 2098 .addImm(32); 2099 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2100 .addReg(Reg, RegState::Kill) 2101 .addGlobalAddress(GV, 0, AArch64II::MO_G3) 2102 .addImm(48); 2103 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 2104 .addReg(Reg, RegState::Kill) 2105 .addImm(0) 2106 .addMemOperand(*MI.memoperands_begin()); 2107 } else if (TM.getCodeModel() == CodeModel::Tiny) { 2108 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg) 2109 .addGlobalAddress(GV, 0, OpFlags); 2110 } else { 2111 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg) 2112 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); 2113 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC; 2114 if (Subtarget.isTargetILP32()) { 2115 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 2116 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 2117 .addDef(Reg32, RegState::Dead) 2118 .addUse(Reg, RegState::Kill) 2119 .addGlobalAddress(GV, 0, LoFlags) 2120 .addMemOperand(*MI.memoperands_begin()) 2121 .addDef(Reg, RegState::Implicit); 2122 } else { 2123 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 2124 .addReg(Reg, RegState::Kill) 2125 .addGlobalAddress(GV, 0, LoFlags) 2126 .addMemOperand(*MI.memoperands_begin()); 2127 } 2128 } 2129 2130 MBB.erase(MI); 2131 2132 return true; 2133 } 2134 2135 // Return true if this instruction simply sets its single destination register 2136 // to zero. This is equivalent to a register rename of the zero-register. 
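// For instance (illustrative), 'MOVZWi %w0, 0, 0' and an ANDWri whose source
// is WZR both qualify, whereas a COPY from anything other than WZR does not.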
bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
    if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
      assert(MI.getDesc().getNumOperands() == 3 &&
             MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
      return true;
    }
    break;
  case AArch64::ANDWri: // and Rd, Rzr, #imm
    return MI.getOperand(1).getReg() == AArch64::WZR;
  case AArch64::ANDXri:
    return MI.getOperand(1).getReg() == AArch64::XZR;
  case TargetOpcode::COPY:
    return MI.getOperand(1).getReg() == AArch64::WZR;
  }
  return false;
}

// Return true if this instruction simply renames a general register without
// modifying bits.
bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    break;
  case TargetOpcode::COPY: {
    // GPR32 copies will be lowered to ORRXrs
    Register DstReg = MI.getOperand(0).getReg();
    return (AArch64::GPR32RegClass.contains(DstReg) ||
            AArch64::GPR64RegClass.contains(DstReg));
  }
  case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
    if (MI.getOperand(1).getReg() == AArch64::XZR) {
      assert(MI.getDesc().getNumOperands() == 4 &&
             MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
      return true;
    }
    break;
  case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
    if (MI.getOperand(2).getImm() == 0) {
      assert(MI.getDesc().getNumOperands() == 4 &&
             MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
      return true;
    }
    break;
  }
  return false;
}

// Return true if this instruction simply renames a floating-point register
// without modifying bits.
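// For instance (illustrative), 'ORRv16i8 %q0, %q1, %q1' qualifies, while
// 'ORRv16i8 %q0, %q1, %q2' combines two different registers and does not.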
2191 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) { 2192 switch (MI.getOpcode()) { 2193 default: 2194 break; 2195 case TargetOpcode::COPY: { 2196 Register DstReg = MI.getOperand(0).getReg(); 2197 return AArch64::FPR128RegClass.contains(DstReg); 2198 } 2199 case AArch64::ORRv16i8: 2200 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) { 2201 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() && 2202 "invalid ORRv16i8 operands"); 2203 return true; 2204 } 2205 break; 2206 } 2207 return false; 2208 } 2209 2210 Register AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, 2211 int &FrameIndex) const { 2212 switch (MI.getOpcode()) { 2213 default: 2214 break; 2215 case AArch64::LDRWui: 2216 case AArch64::LDRXui: 2217 case AArch64::LDRBui: 2218 case AArch64::LDRHui: 2219 case AArch64::LDRSui: 2220 case AArch64::LDRDui: 2221 case AArch64::LDRQui: 2222 case AArch64::LDR_PXI: 2223 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && 2224 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { 2225 FrameIndex = MI.getOperand(1).getIndex(); 2226 return MI.getOperand(0).getReg(); 2227 } 2228 break; 2229 } 2230 2231 return 0; 2232 } 2233 2234 Register AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI, 2235 int &FrameIndex) const { 2236 switch (MI.getOpcode()) { 2237 default: 2238 break; 2239 case AArch64::STRWui: 2240 case AArch64::STRXui: 2241 case AArch64::STRBui: 2242 case AArch64::STRHui: 2243 case AArch64::STRSui: 2244 case AArch64::STRDui: 2245 case AArch64::STRQui: 2246 case AArch64::STR_PXI: 2247 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && 2248 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { 2249 FrameIndex = MI.getOperand(1).getIndex(); 2250 return MI.getOperand(0).getReg(); 2251 } 2252 break; 2253 } 2254 return 0; 2255 } 2256 2257 /// Check all MachineMemOperands for a hint to suppress pairing. 2258 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) { 2259 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 2260 return MMO->getFlags() & MOSuppressPair; 2261 }); 2262 } 2263 2264 /// Set a flag on the first MachineMemOperand to suppress pairing. 2265 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) { 2266 if (MI.memoperands_empty()) 2267 return; 2268 (*MI.memoperands_begin())->setFlags(MOSuppressPair); 2269 } 2270 2271 /// Check all MachineMemOperands for a hint that the load/store is strided. 
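/// The MOStridedAccess flag is attached to MachineMemOperands by target
/// passes (for example the Falkor hardware prefetcher fix); this helper only
/// queries it.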
2272 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) { 2273 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 2274 return MMO->getFlags() & MOStridedAccess; 2275 }); 2276 } 2277 2278 bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) { 2279 switch (Opc) { 2280 default: 2281 return false; 2282 case AArch64::STURSi: 2283 case AArch64::STRSpre: 2284 case AArch64::STURDi: 2285 case AArch64::STRDpre: 2286 case AArch64::STURQi: 2287 case AArch64::STRQpre: 2288 case AArch64::STURBBi: 2289 case AArch64::STURHHi: 2290 case AArch64::STURWi: 2291 case AArch64::STRWpre: 2292 case AArch64::STURXi: 2293 case AArch64::STRXpre: 2294 case AArch64::LDURSi: 2295 case AArch64::LDRSpre: 2296 case AArch64::LDURDi: 2297 case AArch64::LDRDpre: 2298 case AArch64::LDURQi: 2299 case AArch64::LDRQpre: 2300 case AArch64::LDURWi: 2301 case AArch64::LDRWpre: 2302 case AArch64::LDURXi: 2303 case AArch64::LDRXpre: 2304 case AArch64::LDRSWpre: 2305 case AArch64::LDURSWi: 2306 case AArch64::LDURHHi: 2307 case AArch64::LDURBBi: 2308 case AArch64::LDURSBWi: 2309 case AArch64::LDURSHWi: 2310 return true; 2311 } 2312 } 2313 2314 std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) { 2315 switch (Opc) { 2316 default: return {}; 2317 case AArch64::PRFMui: return AArch64::PRFUMi; 2318 case AArch64::LDRXui: return AArch64::LDURXi; 2319 case AArch64::LDRWui: return AArch64::LDURWi; 2320 case AArch64::LDRBui: return AArch64::LDURBi; 2321 case AArch64::LDRHui: return AArch64::LDURHi; 2322 case AArch64::LDRSui: return AArch64::LDURSi; 2323 case AArch64::LDRDui: return AArch64::LDURDi; 2324 case AArch64::LDRQui: return AArch64::LDURQi; 2325 case AArch64::LDRBBui: return AArch64::LDURBBi; 2326 case AArch64::LDRHHui: return AArch64::LDURHHi; 2327 case AArch64::LDRSBXui: return AArch64::LDURSBXi; 2328 case AArch64::LDRSBWui: return AArch64::LDURSBWi; 2329 case AArch64::LDRSHXui: return AArch64::LDURSHXi; 2330 case AArch64::LDRSHWui: return AArch64::LDURSHWi; 2331 case AArch64::LDRSWui: return AArch64::LDURSWi; 2332 case AArch64::STRXui: return AArch64::STURXi; 2333 case AArch64::STRWui: return AArch64::STURWi; 2334 case AArch64::STRBui: return AArch64::STURBi; 2335 case AArch64::STRHui: return AArch64::STURHi; 2336 case AArch64::STRSui: return AArch64::STURSi; 2337 case AArch64::STRDui: return AArch64::STURDi; 2338 case AArch64::STRQui: return AArch64::STURQi; 2339 case AArch64::STRBBui: return AArch64::STURBBi; 2340 case AArch64::STRHHui: return AArch64::STURHHi; 2341 } 2342 } 2343 2344 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { 2345 switch (Opc) { 2346 default: 2347 return 2; 2348 case AArch64::LDPXi: 2349 case AArch64::LDPDi: 2350 case AArch64::STPXi: 2351 case AArch64::STPDi: 2352 case AArch64::LDNPXi: 2353 case AArch64::LDNPDi: 2354 case AArch64::STNPXi: 2355 case AArch64::STNPDi: 2356 case AArch64::LDPQi: 2357 case AArch64::STPQi: 2358 case AArch64::LDNPQi: 2359 case AArch64::STNPQi: 2360 case AArch64::LDPWi: 2361 case AArch64::LDPSi: 2362 case AArch64::STPWi: 2363 case AArch64::STPSi: 2364 case AArch64::LDNPWi: 2365 case AArch64::LDNPSi: 2366 case AArch64::STNPWi: 2367 case AArch64::STNPSi: 2368 case AArch64::LDG: 2369 case AArch64::STGPi: 2370 2371 case AArch64::LD1B_IMM: 2372 case AArch64::LD1B_H_IMM: 2373 case AArch64::LD1B_S_IMM: 2374 case AArch64::LD1B_D_IMM: 2375 case AArch64::LD1SB_H_IMM: 2376 case AArch64::LD1SB_S_IMM: 2377 case AArch64::LD1SB_D_IMM: 2378 case AArch64::LD1H_IMM: 2379 case AArch64::LD1H_S_IMM: 2380 case AArch64::LD1H_D_IMM: 2381 
case AArch64::LD1SH_S_IMM: 2382 case AArch64::LD1SH_D_IMM: 2383 case AArch64::LD1W_IMM: 2384 case AArch64::LD1W_D_IMM: 2385 case AArch64::LD1SW_D_IMM: 2386 case AArch64::LD1D_IMM: 2387 2388 case AArch64::LD2B_IMM: 2389 case AArch64::LD2H_IMM: 2390 case AArch64::LD2W_IMM: 2391 case AArch64::LD2D_IMM: 2392 case AArch64::LD3B_IMM: 2393 case AArch64::LD3H_IMM: 2394 case AArch64::LD3W_IMM: 2395 case AArch64::LD3D_IMM: 2396 case AArch64::LD4B_IMM: 2397 case AArch64::LD4H_IMM: 2398 case AArch64::LD4W_IMM: 2399 case AArch64::LD4D_IMM: 2400 2401 case AArch64::ST1B_IMM: 2402 case AArch64::ST1B_H_IMM: 2403 case AArch64::ST1B_S_IMM: 2404 case AArch64::ST1B_D_IMM: 2405 case AArch64::ST1H_IMM: 2406 case AArch64::ST1H_S_IMM: 2407 case AArch64::ST1H_D_IMM: 2408 case AArch64::ST1W_IMM: 2409 case AArch64::ST1W_D_IMM: 2410 case AArch64::ST1D_IMM: 2411 2412 case AArch64::ST2B_IMM: 2413 case AArch64::ST2H_IMM: 2414 case AArch64::ST2W_IMM: 2415 case AArch64::ST2D_IMM: 2416 case AArch64::ST3B_IMM: 2417 case AArch64::ST3H_IMM: 2418 case AArch64::ST3W_IMM: 2419 case AArch64::ST3D_IMM: 2420 case AArch64::ST4B_IMM: 2421 case AArch64::ST4H_IMM: 2422 case AArch64::ST4W_IMM: 2423 case AArch64::ST4D_IMM: 2424 2425 case AArch64::LD1RB_IMM: 2426 case AArch64::LD1RB_H_IMM: 2427 case AArch64::LD1RB_S_IMM: 2428 case AArch64::LD1RB_D_IMM: 2429 case AArch64::LD1RSB_H_IMM: 2430 case AArch64::LD1RSB_S_IMM: 2431 case AArch64::LD1RSB_D_IMM: 2432 case AArch64::LD1RH_IMM: 2433 case AArch64::LD1RH_S_IMM: 2434 case AArch64::LD1RH_D_IMM: 2435 case AArch64::LD1RSH_S_IMM: 2436 case AArch64::LD1RSH_D_IMM: 2437 case AArch64::LD1RW_IMM: 2438 case AArch64::LD1RW_D_IMM: 2439 case AArch64::LD1RSW_IMM: 2440 case AArch64::LD1RD_IMM: 2441 2442 case AArch64::LDNT1B_ZRI: 2443 case AArch64::LDNT1H_ZRI: 2444 case AArch64::LDNT1W_ZRI: 2445 case AArch64::LDNT1D_ZRI: 2446 case AArch64::STNT1B_ZRI: 2447 case AArch64::STNT1H_ZRI: 2448 case AArch64::STNT1W_ZRI: 2449 case AArch64::STNT1D_ZRI: 2450 2451 case AArch64::LDNF1B_IMM: 2452 case AArch64::LDNF1B_H_IMM: 2453 case AArch64::LDNF1B_S_IMM: 2454 case AArch64::LDNF1B_D_IMM: 2455 case AArch64::LDNF1SB_H_IMM: 2456 case AArch64::LDNF1SB_S_IMM: 2457 case AArch64::LDNF1SB_D_IMM: 2458 case AArch64::LDNF1H_IMM: 2459 case AArch64::LDNF1H_S_IMM: 2460 case AArch64::LDNF1H_D_IMM: 2461 case AArch64::LDNF1SH_S_IMM: 2462 case AArch64::LDNF1SH_D_IMM: 2463 case AArch64::LDNF1W_IMM: 2464 case AArch64::LDNF1W_D_IMM: 2465 case AArch64::LDNF1SW_D_IMM: 2466 case AArch64::LDNF1D_IMM: 2467 return 3; 2468 case AArch64::ADDG: 2469 case AArch64::STGi: 2470 case AArch64::LDR_PXI: 2471 case AArch64::STR_PXI: 2472 return 2; 2473 } 2474 } 2475 2476 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) { 2477 switch (MI.getOpcode()) { 2478 default: 2479 return false; 2480 // Scaled instructions. 2481 case AArch64::STRSui: 2482 case AArch64::STRDui: 2483 case AArch64::STRQui: 2484 case AArch64::STRXui: 2485 case AArch64::STRWui: 2486 case AArch64::LDRSui: 2487 case AArch64::LDRDui: 2488 case AArch64::LDRQui: 2489 case AArch64::LDRXui: 2490 case AArch64::LDRWui: 2491 case AArch64::LDRSWui: 2492 // Unscaled instructions. 
2493 case AArch64::STURSi: 2494 case AArch64::STRSpre: 2495 case AArch64::STURDi: 2496 case AArch64::STRDpre: 2497 case AArch64::STURQi: 2498 case AArch64::STRQpre: 2499 case AArch64::STURWi: 2500 case AArch64::STRWpre: 2501 case AArch64::STURXi: 2502 case AArch64::STRXpre: 2503 case AArch64::LDURSi: 2504 case AArch64::LDRSpre: 2505 case AArch64::LDURDi: 2506 case AArch64::LDRDpre: 2507 case AArch64::LDURQi: 2508 case AArch64::LDRQpre: 2509 case AArch64::LDURWi: 2510 case AArch64::LDRWpre: 2511 case AArch64::LDURXi: 2512 case AArch64::LDRXpre: 2513 case AArch64::LDURSWi: 2514 case AArch64::LDRSWpre: 2515 return true; 2516 } 2517 } 2518 2519 bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) { 2520 switch (MI.getOpcode()) { 2521 default: 2522 assert((!MI.isCall() || !MI.isReturn()) && 2523 "Unexpected instruction - was a new tail call opcode introduced?"); 2524 return false; 2525 case AArch64::TCRETURNdi: 2526 case AArch64::TCRETURNri: 2527 case AArch64::TCRETURNrix16x17: 2528 case AArch64::TCRETURNrix17: 2529 case AArch64::TCRETURNrinotx16: 2530 case AArch64::TCRETURNriALL: 2531 case AArch64::AUTH_TCRETURN: 2532 case AArch64::AUTH_TCRETURN_BTI: 2533 return true; 2534 } 2535 } 2536 2537 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) { 2538 switch (Opc) { 2539 default: 2540 llvm_unreachable("Opcode has no flag setting equivalent!"); 2541 // 32-bit cases: 2542 case AArch64::ADDWri: 2543 return AArch64::ADDSWri; 2544 case AArch64::ADDWrr: 2545 return AArch64::ADDSWrr; 2546 case AArch64::ADDWrs: 2547 return AArch64::ADDSWrs; 2548 case AArch64::ADDWrx: 2549 return AArch64::ADDSWrx; 2550 case AArch64::ANDWri: 2551 return AArch64::ANDSWri; 2552 case AArch64::ANDWrr: 2553 return AArch64::ANDSWrr; 2554 case AArch64::ANDWrs: 2555 return AArch64::ANDSWrs; 2556 case AArch64::BICWrr: 2557 return AArch64::BICSWrr; 2558 case AArch64::BICWrs: 2559 return AArch64::BICSWrs; 2560 case AArch64::SUBWri: 2561 return AArch64::SUBSWri; 2562 case AArch64::SUBWrr: 2563 return AArch64::SUBSWrr; 2564 case AArch64::SUBWrs: 2565 return AArch64::SUBSWrs; 2566 case AArch64::SUBWrx: 2567 return AArch64::SUBSWrx; 2568 // 64-bit cases: 2569 case AArch64::ADDXri: 2570 return AArch64::ADDSXri; 2571 case AArch64::ADDXrr: 2572 return AArch64::ADDSXrr; 2573 case AArch64::ADDXrs: 2574 return AArch64::ADDSXrs; 2575 case AArch64::ADDXrx: 2576 return AArch64::ADDSXrx; 2577 case AArch64::ANDXri: 2578 return AArch64::ANDSXri; 2579 case AArch64::ANDXrr: 2580 return AArch64::ANDSXrr; 2581 case AArch64::ANDXrs: 2582 return AArch64::ANDSXrs; 2583 case AArch64::BICXrr: 2584 return AArch64::BICSXrr; 2585 case AArch64::BICXrs: 2586 return AArch64::BICSXrs; 2587 case AArch64::SUBXri: 2588 return AArch64::SUBSXri; 2589 case AArch64::SUBXrr: 2590 return AArch64::SUBSXrr; 2591 case AArch64::SUBXrs: 2592 return AArch64::SUBSXrs; 2593 case AArch64::SUBXrx: 2594 return AArch64::SUBSXrx; 2595 // SVE instructions: 2596 case AArch64::AND_PPzPP: 2597 return AArch64::ANDS_PPzPP; 2598 case AArch64::BIC_PPzPP: 2599 return AArch64::BICS_PPzPP; 2600 case AArch64::EOR_PPzPP: 2601 return AArch64::EORS_PPzPP; 2602 case AArch64::NAND_PPzPP: 2603 return AArch64::NANDS_PPzPP; 2604 case AArch64::NOR_PPzPP: 2605 return AArch64::NORS_PPzPP; 2606 case AArch64::ORN_PPzPP: 2607 return AArch64::ORNS_PPzPP; 2608 case AArch64::ORR_PPzPP: 2609 return AArch64::ORRS_PPzPP; 2610 case AArch64::BRKA_PPzP: 2611 return AArch64::BRKAS_PPzP; 2612 case AArch64::BRKPA_PPzPP: 2613 return AArch64::BRKPAS_PPzPP; 2614 case AArch64::BRKB_PPzP: 2615 return 
AArch64::BRKBS_PPzP; 2616 case AArch64::BRKPB_PPzPP: 2617 return AArch64::BRKPBS_PPzPP; 2618 case AArch64::BRKN_PPzP: 2619 return AArch64::BRKNS_PPzP; 2620 case AArch64::RDFFR_PPz: 2621 return AArch64::RDFFRS_PPz; 2622 case AArch64::PTRUE_B: 2623 return AArch64::PTRUES_B; 2624 } 2625 } 2626 2627 // Is this a candidate for ld/st merging or pairing? For example, we don't 2628 // touch volatiles or load/stores that have a hint to avoid pair formation. 2629 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const { 2630 2631 bool IsPreLdSt = isPreLdSt(MI); 2632 2633 // If this is a volatile load/store, don't mess with it. 2634 if (MI.hasOrderedMemoryRef()) 2635 return false; 2636 2637 // Make sure this is a reg/fi+imm (as opposed to an address reloc). 2638 // For Pre-inc LD/ST, the operand is shifted by one. 2639 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() || 2640 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) && 2641 "Expected a reg or frame index operand."); 2642 2643 // For Pre-indexed addressing quadword instructions, the third operand is the 2644 // immediate value. 2645 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm(); 2646 2647 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt) 2648 return false; 2649 2650 // Can't merge/pair if the instruction modifies the base register. 2651 // e.g., ldr x0, [x0] 2652 // This case will never occur with an FI base. 2653 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or 2654 // STR<S,D,Q,W,X>pre, it can be merged. 2655 // For example: 2656 // ldr q0, [x11, #32]! 2657 // ldr q1, [x11, #16] 2658 // to 2659 // ldp q0, q1, [x11, #32]! 2660 if (MI.getOperand(1).isReg() && !IsPreLdSt) { 2661 Register BaseReg = MI.getOperand(1).getReg(); 2662 const TargetRegisterInfo *TRI = &getRegisterInfo(); 2663 if (MI.modifiesRegister(BaseReg, TRI)) 2664 return false; 2665 } 2666 2667 // Check if this load/store has a hint to avoid pair formation. 2668 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. 2669 if (isLdStPairSuppressed(MI)) 2670 return false; 2671 2672 // Do not pair any callee-save store/reload instructions in the 2673 // prologue/epilogue if the CFI information encoded the operations as separate 2674 // instructions, as that will cause the size of the actual prologue to mismatch 2675 // with the prologue size recorded in the Windows CFI. 2676 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo(); 2677 bool NeedsWinCFI = MAI->usesWindowsCFI() && 2678 MI.getMF()->getFunction().needsUnwindTableEntry(); 2679 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) || 2680 MI.getFlag(MachineInstr::FrameDestroy))) 2681 return false; 2682 2683 // On some CPUs quad load/store pairs are slower than two single load/stores. 
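  // Illustrative effect: with such tuning, 'ldr q0, [x0]' and
  // 'ldr q1, [x0, #16]' are deliberately left unpaired instead of being
  // merged into 'ldp q0, q1, [x0]'.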
2684 if (Subtarget.isPaired128Slow()) { 2685 switch (MI.getOpcode()) { 2686 default: 2687 break; 2688 case AArch64::LDURQi: 2689 case AArch64::STURQi: 2690 case AArch64::LDRQui: 2691 case AArch64::STRQui: 2692 return false; 2693 } 2694 } 2695 2696 return true; 2697 } 2698 2699 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth( 2700 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, 2701 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, 2702 const TargetRegisterInfo *TRI) const { 2703 if (!LdSt.mayLoadOrStore()) 2704 return false; 2705 2706 const MachineOperand *BaseOp; 2707 TypeSize WidthN(0, false); 2708 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable, 2709 WidthN, TRI)) 2710 return false; 2711 // The maximum vscale is 16 under AArch64, return the maximal extent for the 2712 // vector. 2713 Width = LocationSize::precise(WidthN); 2714 BaseOps.push_back(BaseOp); 2715 return true; 2716 } 2717 2718 std::optional<ExtAddrMode> 2719 AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI, 2720 const TargetRegisterInfo *TRI) const { 2721 const MachineOperand *Base; // Filled with the base operand of MI. 2722 int64_t Offset; // Filled with the offset of MI. 2723 bool OffsetIsScalable; 2724 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI)) 2725 return std::nullopt; 2726 2727 if (!Base->isReg()) 2728 return std::nullopt; 2729 ExtAddrMode AM; 2730 AM.BaseReg = Base->getReg(); 2731 AM.Displacement = Offset; 2732 AM.ScaledReg = 0; 2733 AM.Scale = 0; 2734 return AM; 2735 } 2736 2737 bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI, 2738 Register Reg, 2739 const MachineInstr &AddrI, 2740 ExtAddrMode &AM) const { 2741 // Filter out instructions into which we cannot fold. 
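  // The switch below records the access size in bytes (NumBytes) and, for
  // the scaled [Reg, #Imm] forms, the implied scale of the encoded immediate
  // (OffsetScale); both are used further down to validate the folded
  // addressing mode.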
2742 unsigned NumBytes; 2743 int64_t OffsetScale = 1; 2744 switch (MemI.getOpcode()) { 2745 default: 2746 return false; 2747 2748 case AArch64::LDURQi: 2749 case AArch64::STURQi: 2750 NumBytes = 16; 2751 break; 2752 2753 case AArch64::LDURDi: 2754 case AArch64::STURDi: 2755 case AArch64::LDURXi: 2756 case AArch64::STURXi: 2757 NumBytes = 8; 2758 break; 2759 2760 case AArch64::LDURWi: 2761 case AArch64::LDURSWi: 2762 case AArch64::STURWi: 2763 NumBytes = 4; 2764 break; 2765 2766 case AArch64::LDURHi: 2767 case AArch64::STURHi: 2768 case AArch64::LDURHHi: 2769 case AArch64::STURHHi: 2770 case AArch64::LDURSHXi: 2771 case AArch64::LDURSHWi: 2772 NumBytes = 2; 2773 break; 2774 2775 case AArch64::LDRBroX: 2776 case AArch64::LDRBBroX: 2777 case AArch64::LDRSBXroX: 2778 case AArch64::LDRSBWroX: 2779 case AArch64::STRBroX: 2780 case AArch64::STRBBroX: 2781 case AArch64::LDURBi: 2782 case AArch64::LDURBBi: 2783 case AArch64::LDURSBXi: 2784 case AArch64::LDURSBWi: 2785 case AArch64::STURBi: 2786 case AArch64::STURBBi: 2787 case AArch64::LDRBui: 2788 case AArch64::LDRBBui: 2789 case AArch64::LDRSBXui: 2790 case AArch64::LDRSBWui: 2791 case AArch64::STRBui: 2792 case AArch64::STRBBui: 2793 NumBytes = 1; 2794 break; 2795 2796 case AArch64::LDRQroX: 2797 case AArch64::STRQroX: 2798 case AArch64::LDRQui: 2799 case AArch64::STRQui: 2800 NumBytes = 16; 2801 OffsetScale = 16; 2802 break; 2803 2804 case AArch64::LDRDroX: 2805 case AArch64::STRDroX: 2806 case AArch64::LDRXroX: 2807 case AArch64::STRXroX: 2808 case AArch64::LDRDui: 2809 case AArch64::STRDui: 2810 case AArch64::LDRXui: 2811 case AArch64::STRXui: 2812 NumBytes = 8; 2813 OffsetScale = 8; 2814 break; 2815 2816 case AArch64::LDRWroX: 2817 case AArch64::LDRSWroX: 2818 case AArch64::STRWroX: 2819 case AArch64::LDRWui: 2820 case AArch64::LDRSWui: 2821 case AArch64::STRWui: 2822 NumBytes = 4; 2823 OffsetScale = 4; 2824 break; 2825 2826 case AArch64::LDRHroX: 2827 case AArch64::STRHroX: 2828 case AArch64::LDRHHroX: 2829 case AArch64::STRHHroX: 2830 case AArch64::LDRSHXroX: 2831 case AArch64::LDRSHWroX: 2832 case AArch64::LDRHui: 2833 case AArch64::STRHui: 2834 case AArch64::LDRHHui: 2835 case AArch64::STRHHui: 2836 case AArch64::LDRSHXui: 2837 case AArch64::LDRSHWui: 2838 NumBytes = 2; 2839 OffsetScale = 2; 2840 break; 2841 } 2842 2843 // Check the fold operand is not the loaded/stored value. 2844 const MachineOperand &BaseRegOp = MemI.getOperand(0); 2845 if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg) 2846 return false; 2847 2848 // Handle memory instructions with a [Reg, Reg] addressing mode. 2849 if (MemI.getOperand(2).isReg()) { 2850 // Bail if the addressing mode already includes extension of the offset 2851 // register. 2852 if (MemI.getOperand(3).getImm()) 2853 return false; 2854 2855 // Check if we actually have a scaled offset. 2856 if (MemI.getOperand(4).getImm() == 0) 2857 OffsetScale = 1; 2858 2859 // If the address instructions is folded into the base register, then the 2860 // addressing mode must not have a scale. Then we can swap the base and the 2861 // scaled registers. 
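    // Illustrative example: if the add being folded defines the base of
    // 'ldr x0, [x1, x2, lsl #3]', the fold is rejected because the scale is
    // not 1; with a scale of 1 the base and offset registers can simply swap
    // roles.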
2862 if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1) 2863 return false; 2864 2865 switch (AddrI.getOpcode()) { 2866 default: 2867 return false; 2868 2869 case AArch64::SBFMXri: 2870 // sxtw Xa, Wm 2871 // ldr Xd, [Xn, Xa, lsl #N] 2872 // -> 2873 // ldr Xd, [Xn, Wm, sxtw #N] 2874 if (AddrI.getOperand(2).getImm() != 0 || 2875 AddrI.getOperand(3).getImm() != 31) 2876 return false; 2877 2878 AM.BaseReg = MemI.getOperand(1).getReg(); 2879 if (AM.BaseReg == Reg) 2880 AM.BaseReg = MemI.getOperand(2).getReg(); 2881 AM.ScaledReg = AddrI.getOperand(1).getReg(); 2882 AM.Scale = OffsetScale; 2883 AM.Displacement = 0; 2884 AM.Form = ExtAddrMode::Formula::SExtScaledReg; 2885 return true; 2886 2887 case TargetOpcode::SUBREG_TO_REG: { 2888 // mov Wa, Wm 2889 // ldr Xd, [Xn, Xa, lsl #N] 2890 // -> 2891 // ldr Xd, [Xn, Wm, uxtw #N] 2892 2893 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG. 2894 if (AddrI.getOperand(1).getImm() != 0 || 2895 AddrI.getOperand(3).getImm() != AArch64::sub_32) 2896 return false; 2897 2898 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo(); 2899 Register OffsetReg = AddrI.getOperand(2).getReg(); 2900 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg)) 2901 return false; 2902 2903 const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg); 2904 if (DefMI.getOpcode() != AArch64::ORRWrs || 2905 DefMI.getOperand(1).getReg() != AArch64::WZR || 2906 DefMI.getOperand(3).getImm() != 0) 2907 return false; 2908 2909 AM.BaseReg = MemI.getOperand(1).getReg(); 2910 if (AM.BaseReg == Reg) 2911 AM.BaseReg = MemI.getOperand(2).getReg(); 2912 AM.ScaledReg = DefMI.getOperand(2).getReg(); 2913 AM.Scale = OffsetScale; 2914 AM.Displacement = 0; 2915 AM.Form = ExtAddrMode::Formula::ZExtScaledReg; 2916 return true; 2917 } 2918 } 2919 } 2920 2921 // Handle memory instructions with a [Reg, #Imm] addressing mode. 2922 2923 // Check we are not breaking a potential conversion to an LDP. 2924 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset, 2925 int64_t NewOffset) -> bool { 2926 int64_t MinOffset, MaxOffset; 2927 switch (NumBytes) { 2928 default: 2929 return true; 2930 case 4: 2931 MinOffset = -256; 2932 MaxOffset = 252; 2933 break; 2934 case 8: 2935 MinOffset = -512; 2936 MaxOffset = 504; 2937 break; 2938 case 16: 2939 MinOffset = -1024; 2940 MaxOffset = 1008; 2941 break; 2942 } 2943 return OldOffset < MinOffset || OldOffset > MaxOffset || 2944 (NewOffset >= MinOffset && NewOffset <= MaxOffset); 2945 }; 2946 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool { 2947 int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale; 2948 int64_t NewOffset = OldOffset + Disp; 2949 if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0)) 2950 return false; 2951 // If the old offset would fit into an LDP, but the new offset wouldn't, 2952 // bail out. 
2953 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset)) 2954 return false; 2955 AM.BaseReg = AddrI.getOperand(1).getReg(); 2956 AM.ScaledReg = 0; 2957 AM.Scale = 0; 2958 AM.Displacement = NewOffset; 2959 AM.Form = ExtAddrMode::Formula::Basic; 2960 return true; 2961 }; 2962 2963 auto canFoldAddRegIntoAddrMode = 2964 [&](int64_t Scale, 2965 ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool { 2966 if (MemI.getOperand(2).getImm() != 0) 2967 return false; 2968 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale)) 2969 return false; 2970 AM.BaseReg = AddrI.getOperand(1).getReg(); 2971 AM.ScaledReg = AddrI.getOperand(2).getReg(); 2972 AM.Scale = Scale; 2973 AM.Displacement = 0; 2974 AM.Form = Form; 2975 return true; 2976 }; 2977 2978 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) { 2979 unsigned Opcode = MemI.getOpcode(); 2980 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) && 2981 Subtarget.isSTRQroSlow(); 2982 }; 2983 2984 int64_t Disp = 0; 2985 const bool OptSize = MemI.getMF()->getFunction().hasOptSize(); 2986 switch (AddrI.getOpcode()) { 2987 default: 2988 return false; 2989 2990 case AArch64::ADDXri: 2991 // add Xa, Xn, #N 2992 // ldr Xd, [Xa, #M] 2993 // -> 2994 // ldr Xd, [Xn, #N'+M] 2995 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm(); 2996 return canFoldAddSubImmIntoAddrMode(Disp); 2997 2998 case AArch64::SUBXri: 2999 // sub Xa, Xn, #N 3000 // ldr Xd, [Xa, #M] 3001 // -> 3002 // ldr Xd, [Xn, #N'+M] 3003 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm(); 3004 return canFoldAddSubImmIntoAddrMode(-Disp); 3005 3006 case AArch64::ADDXrs: { 3007 // add Xa, Xn, Xm, lsl #N 3008 // ldr Xd, [Xa] 3009 // -> 3010 // ldr Xd, [Xn, Xm, lsl #N] 3011 3012 // Don't fold the add if the result would be slower, unless optimising for 3013 // size. 3014 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm()); 3015 if (AArch64_AM::getShiftType(Shift) != AArch64_AM::ShiftExtendType::LSL) 3016 return false; 3017 Shift = AArch64_AM::getShiftValue(Shift); 3018 if (!OptSize) { 3019 if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14()) 3020 return false; 3021 if (avoidSlowSTRQ(MemI)) 3022 return false; 3023 } 3024 return canFoldAddRegIntoAddrMode(1ULL << Shift); 3025 } 3026 3027 case AArch64::ADDXrr: 3028 // add Xa, Xn, Xm 3029 // ldr Xd, [Xa] 3030 // -> 3031 // ldr Xd, [Xn, Xm, lsl #0] 3032 3033 // Don't fold the add if the result would be slower, unless optimising for 3034 // size. 3035 if (!OptSize && avoidSlowSTRQ(MemI)) 3036 return false; 3037 return canFoldAddRegIntoAddrMode(1); 3038 3039 case AArch64::ADDXrx: 3040 // add Xa, Xn, Wm, {s,u}xtw #N 3041 // ldr Xd, [Xa] 3042 // -> 3043 // ldr Xd, [Xn, Wm, {s,u}xtw #N] 3044 3045 // Don't fold the add if the result would be slower, unless optimising for 3046 // size. 3047 if (!OptSize && avoidSlowSTRQ(MemI)) 3048 return false; 3049 3050 // Can fold only sign-/zero-extend of a word. 3051 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm()); 3052 AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm); 3053 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW) 3054 return false; 3055 3056 return canFoldAddRegIntoAddrMode( 3057 1ULL << AArch64_AM::getArithShiftValue(Imm), 3058 (Extend == AArch64_AM::SXTW) ? 
ExtAddrMode::Formula::SExtScaledReg 3059 : ExtAddrMode::Formula::ZExtScaledReg); 3060 } 3061 } 3062 3063 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, 3064 // return the opcode of an instruction performing the same operation, but using 3065 // the [Reg, Reg] addressing mode. 3066 static unsigned regOffsetOpcode(unsigned Opcode) { 3067 switch (Opcode) { 3068 default: 3069 llvm_unreachable("Address folding not implemented for instruction"); 3070 3071 case AArch64::LDURQi: 3072 case AArch64::LDRQui: 3073 return AArch64::LDRQroX; 3074 case AArch64::STURQi: 3075 case AArch64::STRQui: 3076 return AArch64::STRQroX; 3077 case AArch64::LDURDi: 3078 case AArch64::LDRDui: 3079 return AArch64::LDRDroX; 3080 case AArch64::STURDi: 3081 case AArch64::STRDui: 3082 return AArch64::STRDroX; 3083 case AArch64::LDURXi: 3084 case AArch64::LDRXui: 3085 return AArch64::LDRXroX; 3086 case AArch64::STURXi: 3087 case AArch64::STRXui: 3088 return AArch64::STRXroX; 3089 case AArch64::LDURWi: 3090 case AArch64::LDRWui: 3091 return AArch64::LDRWroX; 3092 case AArch64::LDURSWi: 3093 case AArch64::LDRSWui: 3094 return AArch64::LDRSWroX; 3095 case AArch64::STURWi: 3096 case AArch64::STRWui: 3097 return AArch64::STRWroX; 3098 case AArch64::LDURHi: 3099 case AArch64::LDRHui: 3100 return AArch64::LDRHroX; 3101 case AArch64::STURHi: 3102 case AArch64::STRHui: 3103 return AArch64::STRHroX; 3104 case AArch64::LDURHHi: 3105 case AArch64::LDRHHui: 3106 return AArch64::LDRHHroX; 3107 case AArch64::STURHHi: 3108 case AArch64::STRHHui: 3109 return AArch64::STRHHroX; 3110 case AArch64::LDURSHXi: 3111 case AArch64::LDRSHXui: 3112 return AArch64::LDRSHXroX; 3113 case AArch64::LDURSHWi: 3114 case AArch64::LDRSHWui: 3115 return AArch64::LDRSHWroX; 3116 case AArch64::LDURBi: 3117 case AArch64::LDRBui: 3118 return AArch64::LDRBroX; 3119 case AArch64::LDURBBi: 3120 case AArch64::LDRBBui: 3121 return AArch64::LDRBBroX; 3122 case AArch64::LDURSBXi: 3123 case AArch64::LDRSBXui: 3124 return AArch64::LDRSBXroX; 3125 case AArch64::LDURSBWi: 3126 case AArch64::LDRSBWui: 3127 return AArch64::LDRSBWroX; 3128 case AArch64::STURBi: 3129 case AArch64::STRBui: 3130 return AArch64::STRBroX; 3131 case AArch64::STURBBi: 3132 case AArch64::STRBBui: 3133 return AArch64::STRBBroX; 3134 } 3135 } 3136 3137 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return 3138 // the opcode of an instruction performing the same operation, but using the 3139 // [Reg, #Imm] addressing mode with scaled offset. 
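// For example, LDURXi maps to LDRXui with Scale set to 8, so a byte offset of
// 32 is later re-encoded as an immediate of 4.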
3140 unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) { 3141 switch (Opcode) { 3142 default: 3143 llvm_unreachable("Address folding not implemented for instruction"); 3144 3145 case AArch64::LDURQi: 3146 Scale = 16; 3147 return AArch64::LDRQui; 3148 case AArch64::STURQi: 3149 Scale = 16; 3150 return AArch64::STRQui; 3151 case AArch64::LDURDi: 3152 Scale = 8; 3153 return AArch64::LDRDui; 3154 case AArch64::STURDi: 3155 Scale = 8; 3156 return AArch64::STRDui; 3157 case AArch64::LDURXi: 3158 Scale = 8; 3159 return AArch64::LDRXui; 3160 case AArch64::STURXi: 3161 Scale = 8; 3162 return AArch64::STRXui; 3163 case AArch64::LDURWi: 3164 Scale = 4; 3165 return AArch64::LDRWui; 3166 case AArch64::LDURSWi: 3167 Scale = 4; 3168 return AArch64::LDRSWui; 3169 case AArch64::STURWi: 3170 Scale = 4; 3171 return AArch64::STRWui; 3172 case AArch64::LDURHi: 3173 Scale = 2; 3174 return AArch64::LDRHui; 3175 case AArch64::STURHi: 3176 Scale = 2; 3177 return AArch64::STRHui; 3178 case AArch64::LDURHHi: 3179 Scale = 2; 3180 return AArch64::LDRHHui; 3181 case AArch64::STURHHi: 3182 Scale = 2; 3183 return AArch64::STRHHui; 3184 case AArch64::LDURSHXi: 3185 Scale = 2; 3186 return AArch64::LDRSHXui; 3187 case AArch64::LDURSHWi: 3188 Scale = 2; 3189 return AArch64::LDRSHWui; 3190 case AArch64::LDURBi: 3191 Scale = 1; 3192 return AArch64::LDRBui; 3193 case AArch64::LDURBBi: 3194 Scale = 1; 3195 return AArch64::LDRBBui; 3196 case AArch64::LDURSBXi: 3197 Scale = 1; 3198 return AArch64::LDRSBXui; 3199 case AArch64::LDURSBWi: 3200 Scale = 1; 3201 return AArch64::LDRSBWui; 3202 case AArch64::STURBi: 3203 Scale = 1; 3204 return AArch64::STRBui; 3205 case AArch64::STURBBi: 3206 Scale = 1; 3207 return AArch64::STRBBui; 3208 case AArch64::LDRQui: 3209 case AArch64::STRQui: 3210 Scale = 16; 3211 return Opcode; 3212 case AArch64::LDRDui: 3213 case AArch64::STRDui: 3214 case AArch64::LDRXui: 3215 case AArch64::STRXui: 3216 Scale = 8; 3217 return Opcode; 3218 case AArch64::LDRWui: 3219 case AArch64::LDRSWui: 3220 case AArch64::STRWui: 3221 Scale = 4; 3222 return Opcode; 3223 case AArch64::LDRHui: 3224 case AArch64::STRHui: 3225 case AArch64::LDRHHui: 3226 case AArch64::STRHHui: 3227 case AArch64::LDRSHXui: 3228 case AArch64::LDRSHWui: 3229 Scale = 2; 3230 return Opcode; 3231 case AArch64::LDRBui: 3232 case AArch64::LDRBBui: 3233 case AArch64::LDRSBXui: 3234 case AArch64::LDRSBWui: 3235 case AArch64::STRBui: 3236 case AArch64::STRBBui: 3237 Scale = 1; 3238 return Opcode; 3239 } 3240 } 3241 3242 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return 3243 // the opcode of an instruction performing the same operation, but using the 3244 // [Reg, #Imm] addressing mode with unscaled offset. 
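// For example, LDRXui maps back to LDURXi, while opcodes that already take an
// unscaled offset are returned unchanged.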
3245 unsigned unscaledOffsetOpcode(unsigned Opcode) { 3246 switch (Opcode) { 3247 default: 3248 llvm_unreachable("Address folding not implemented for instruction"); 3249 3250 case AArch64::LDURQi: 3251 case AArch64::STURQi: 3252 case AArch64::LDURDi: 3253 case AArch64::STURDi: 3254 case AArch64::LDURXi: 3255 case AArch64::STURXi: 3256 case AArch64::LDURWi: 3257 case AArch64::LDURSWi: 3258 case AArch64::STURWi: 3259 case AArch64::LDURHi: 3260 case AArch64::STURHi: 3261 case AArch64::LDURHHi: 3262 case AArch64::STURHHi: 3263 case AArch64::LDURSHXi: 3264 case AArch64::LDURSHWi: 3265 case AArch64::LDURBi: 3266 case AArch64::STURBi: 3267 case AArch64::LDURBBi: 3268 case AArch64::STURBBi: 3269 case AArch64::LDURSBWi: 3270 case AArch64::LDURSBXi: 3271 return Opcode; 3272 case AArch64::LDRQui: 3273 return AArch64::LDURQi; 3274 case AArch64::STRQui: 3275 return AArch64::STURQi; 3276 case AArch64::LDRDui: 3277 return AArch64::LDURDi; 3278 case AArch64::STRDui: 3279 return AArch64::STURDi; 3280 case AArch64::LDRXui: 3281 return AArch64::LDURXi; 3282 case AArch64::STRXui: 3283 return AArch64::STURXi; 3284 case AArch64::LDRWui: 3285 return AArch64::LDURWi; 3286 case AArch64::LDRSWui: 3287 return AArch64::LDURSWi; 3288 case AArch64::STRWui: 3289 return AArch64::STURWi; 3290 case AArch64::LDRHui: 3291 return AArch64::LDURHi; 3292 case AArch64::STRHui: 3293 return AArch64::STURHi; 3294 case AArch64::LDRHHui: 3295 return AArch64::LDURHHi; 3296 case AArch64::STRHHui: 3297 return AArch64::STURHHi; 3298 case AArch64::LDRSHXui: 3299 return AArch64::LDURSHXi; 3300 case AArch64::LDRSHWui: 3301 return AArch64::LDURSHWi; 3302 case AArch64::LDRBBui: 3303 return AArch64::LDURBBi; 3304 case AArch64::LDRBui: 3305 return AArch64::LDURBi; 3306 case AArch64::STRBBui: 3307 return AArch64::STURBBi; 3308 case AArch64::STRBui: 3309 return AArch64::STURBi; 3310 case AArch64::LDRSBWui: 3311 return AArch64::LDURSBWi; 3312 case AArch64::LDRSBXui: 3313 return AArch64::LDURSBXi; 3314 } 3315 } 3316 3317 // Given the opcode of a memory load/store instruction, return the opcode of an 3318 // instruction performing the same operation, but using 3319 // the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the 3320 // offset register. 
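// For example, LDRXroX, LDURXi and LDRXui all map to LDRXroW, which takes a
// 32-bit offset register plus explicit sign/zero-extension and shift
// operands.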
3321 static unsigned offsetExtendOpcode(unsigned Opcode) { 3322 switch (Opcode) { 3323 default: 3324 llvm_unreachable("Address folding not implemented for instruction"); 3325 3326 case AArch64::LDRQroX: 3327 case AArch64::LDURQi: 3328 case AArch64::LDRQui: 3329 return AArch64::LDRQroW; 3330 case AArch64::STRQroX: 3331 case AArch64::STURQi: 3332 case AArch64::STRQui: 3333 return AArch64::STRQroW; 3334 case AArch64::LDRDroX: 3335 case AArch64::LDURDi: 3336 case AArch64::LDRDui: 3337 return AArch64::LDRDroW; 3338 case AArch64::STRDroX: 3339 case AArch64::STURDi: 3340 case AArch64::STRDui: 3341 return AArch64::STRDroW; 3342 case AArch64::LDRXroX: 3343 case AArch64::LDURXi: 3344 case AArch64::LDRXui: 3345 return AArch64::LDRXroW; 3346 case AArch64::STRXroX: 3347 case AArch64::STURXi: 3348 case AArch64::STRXui: 3349 return AArch64::STRXroW; 3350 case AArch64::LDRWroX: 3351 case AArch64::LDURWi: 3352 case AArch64::LDRWui: 3353 return AArch64::LDRWroW; 3354 case AArch64::LDRSWroX: 3355 case AArch64::LDURSWi: 3356 case AArch64::LDRSWui: 3357 return AArch64::LDRSWroW; 3358 case AArch64::STRWroX: 3359 case AArch64::STURWi: 3360 case AArch64::STRWui: 3361 return AArch64::STRWroW; 3362 case AArch64::LDRHroX: 3363 case AArch64::LDURHi: 3364 case AArch64::LDRHui: 3365 return AArch64::LDRHroW; 3366 case AArch64::STRHroX: 3367 case AArch64::STURHi: 3368 case AArch64::STRHui: 3369 return AArch64::STRHroW; 3370 case AArch64::LDRHHroX: 3371 case AArch64::LDURHHi: 3372 case AArch64::LDRHHui: 3373 return AArch64::LDRHHroW; 3374 case AArch64::STRHHroX: 3375 case AArch64::STURHHi: 3376 case AArch64::STRHHui: 3377 return AArch64::STRHHroW; 3378 case AArch64::LDRSHXroX: 3379 case AArch64::LDURSHXi: 3380 case AArch64::LDRSHXui: 3381 return AArch64::LDRSHXroW; 3382 case AArch64::LDRSHWroX: 3383 case AArch64::LDURSHWi: 3384 case AArch64::LDRSHWui: 3385 return AArch64::LDRSHWroW; 3386 case AArch64::LDRBroX: 3387 case AArch64::LDURBi: 3388 case AArch64::LDRBui: 3389 return AArch64::LDRBroW; 3390 case AArch64::LDRBBroX: 3391 case AArch64::LDURBBi: 3392 case AArch64::LDRBBui: 3393 return AArch64::LDRBBroW; 3394 case AArch64::LDRSBXroX: 3395 case AArch64::LDURSBXi: 3396 case AArch64::LDRSBXui: 3397 return AArch64::LDRSBXroW; 3398 case AArch64::LDRSBWroX: 3399 case AArch64::LDURSBWi: 3400 case AArch64::LDRSBWui: 3401 return AArch64::LDRSBWroW; 3402 case AArch64::STRBroX: 3403 case AArch64::STURBi: 3404 case AArch64::STRBui: 3405 return AArch64::STRBroW; 3406 case AArch64::STRBBroX: 3407 case AArch64::STURBBi: 3408 case AArch64::STRBBui: 3409 return AArch64::STRBBroW; 3410 } 3411 } 3412 3413 MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI, 3414 const ExtAddrMode &AM) const { 3415 3416 const DebugLoc &DL = MemI.getDebugLoc(); 3417 MachineBasicBlock &MBB = *MemI.getParent(); 3418 MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo(); 3419 3420 if (AM.Form == ExtAddrMode::Formula::Basic) { 3421 if (AM.ScaledReg) { 3422 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`. 3423 unsigned Opcode = regOffsetOpcode(MemI.getOpcode()); 3424 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass); 3425 auto B = BuildMI(MBB, MemI, DL, get(Opcode)) 3426 .addReg(MemI.getOperand(0).getReg(), 3427 MemI.mayLoad() ? 
RegState::Define : 0) 3428 .addReg(AM.BaseReg) 3429 .addReg(AM.ScaledReg) 3430 .addImm(0) 3431 .addImm(AM.Scale > 1) 3432 .setMemRefs(MemI.memoperands()) 3433 .setMIFlags(MemI.getFlags()); 3434 return B.getInstr(); 3435 } 3436 3437 assert(AM.ScaledReg == 0 && AM.Scale == 0 && 3438 "Addressing mode not supported for folding"); 3439 3440 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`. 3441 unsigned Scale = 1; 3442 unsigned Opcode = MemI.getOpcode(); 3443 if (isInt<9>(AM.Displacement)) 3444 Opcode = unscaledOffsetOpcode(Opcode); 3445 else 3446 Opcode = scaledOffsetOpcode(Opcode, Scale); 3447 3448 auto B = BuildMI(MBB, MemI, DL, get(Opcode)) 3449 .addReg(MemI.getOperand(0).getReg(), 3450 MemI.mayLoad() ? RegState::Define : 0) 3451 .addReg(AM.BaseReg) 3452 .addImm(AM.Displacement / Scale) 3453 .setMemRefs(MemI.memoperands()) 3454 .setMIFlags(MemI.getFlags()); 3455 return B.getInstr(); 3456 } 3457 3458 if (AM.Form == ExtAddrMode::Formula::SExtScaledReg || 3459 AM.Form == ExtAddrMode::Formula::ZExtScaledReg) { 3460 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`. 3461 assert(AM.ScaledReg && !AM.Displacement && 3462 "Address offset can be a register or an immediate, but not both"); 3463 unsigned Opcode = offsetExtendOpcode(MemI.getOpcode()); 3464 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass); 3465 // Make sure the offset register is in the correct register class. 3466 Register OffsetReg = AM.ScaledReg; 3467 const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg); 3468 if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) { 3469 OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 3470 BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg) 3471 .addReg(AM.ScaledReg, 0, AArch64::sub_32); 3472 } 3473 auto B = BuildMI(MBB, MemI, DL, get(Opcode)) 3474 .addReg(MemI.getOperand(0).getReg(), 3475 MemI.mayLoad() ? RegState::Define : 0) 3476 .addReg(AM.BaseReg) 3477 .addReg(OffsetReg) 3478 .addImm(AM.Form == ExtAddrMode::Formula::SExtScaledReg) 3479 .addImm(AM.Scale != 1) 3480 .setMemRefs(MemI.memoperands()) 3481 .setMIFlags(MemI.getFlags()); 3482 3483 return B.getInstr(); 3484 } 3485 3486 llvm_unreachable( 3487 "Function must not be called with an addressing mode it can't handle"); 3488 } 3489 3490 bool AArch64InstrInfo::getMemOperandWithOffsetWidth( 3491 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset, 3492 bool &OffsetIsScalable, TypeSize &Width, 3493 const TargetRegisterInfo *TRI) const { 3494 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 3495 // Handle only loads/stores with base register followed by immediate offset. 3496 if (LdSt.getNumExplicitOperands() == 3) { 3497 // Non-paired instruction (e.g., ldr x1, [x0, #8]). 3498 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) || 3499 !LdSt.getOperand(2).isImm()) 3500 return false; 3501 } else if (LdSt.getNumExplicitOperands() == 4) { 3502 // Paired instruction (e.g., ldp x1, x2, [x0, #8]). 3503 if (!LdSt.getOperand(1).isReg() || 3504 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) || 3505 !LdSt.getOperand(3).isImm()) 3506 return false; 3507 } else 3508 return false; 3509 3510 // Get the scaling factor for the instruction and set the width for the 3511 // instruction. 3512 TypeSize Scale(0U, false); 3513 int64_t Dummy1, Dummy2; 3514 3515 // If this returns false, then it's an instruction we don't want to handle. 
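  // getMemOpInfo reports the scale applied to the encoded immediate and the
  // access width; for SVE loads/stores both are scalable (vscale-relative)
  // quantities.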
3516 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2)) 3517 return false; 3518 3519 // Compute the offset. Offset is calculated as the immediate operand 3520 // multiplied by the scaling factor. Unscaled instructions have scaling factor 3521 // set to 1. 3522 if (LdSt.getNumExplicitOperands() == 3) { 3523 BaseOp = &LdSt.getOperand(1); 3524 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue(); 3525 } else { 3526 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); 3527 BaseOp = &LdSt.getOperand(2); 3528 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue(); 3529 } 3530 OffsetIsScalable = Scale.isScalable(); 3531 3532 if (!BaseOp->isReg() && !BaseOp->isFI()) 3533 return false; 3534 3535 return true; 3536 } 3537 3538 MachineOperand & 3539 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const { 3540 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 3541 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1); 3542 assert(OfsOp.isImm() && "Offset operand wasn't immediate."); 3543 return OfsOp; 3544 } 3545 3546 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, 3547 TypeSize &Width, int64_t &MinOffset, 3548 int64_t &MaxOffset) { 3549 switch (Opcode) { 3550 // Not a memory operation or something we want to handle. 3551 default: 3552 Scale = TypeSize::getFixed(0); 3553 Width = TypeSize::getFixed(0); 3554 MinOffset = MaxOffset = 0; 3555 return false; 3556 // LDR / STR 3557 case AArch64::LDRQui: 3558 case AArch64::STRQui: 3559 Scale = TypeSize::getFixed(16); 3560 Width = TypeSize::getFixed(16); 3561 MinOffset = 0; 3562 MaxOffset = 4095; 3563 break; 3564 case AArch64::LDRXui: 3565 case AArch64::LDRDui: 3566 case AArch64::STRXui: 3567 case AArch64::STRDui: 3568 case AArch64::PRFMui: 3569 Scale = TypeSize::getFixed(8); 3570 Width = TypeSize::getFixed(8); 3571 MinOffset = 0; 3572 MaxOffset = 4095; 3573 break; 3574 case AArch64::LDRWui: 3575 case AArch64::LDRSui: 3576 case AArch64::LDRSWui: 3577 case AArch64::STRWui: 3578 case AArch64::STRSui: 3579 Scale = TypeSize::getFixed(4); 3580 Width = TypeSize::getFixed(4); 3581 MinOffset = 0; 3582 MaxOffset = 4095; 3583 break; 3584 case AArch64::LDRHui: 3585 case AArch64::LDRHHui: 3586 case AArch64::LDRSHWui: 3587 case AArch64::LDRSHXui: 3588 case AArch64::STRHui: 3589 case AArch64::STRHHui: 3590 Scale = TypeSize::getFixed(2); 3591 Width = TypeSize::getFixed(2); 3592 MinOffset = 0; 3593 MaxOffset = 4095; 3594 break; 3595 case AArch64::LDRBui: 3596 case AArch64::LDRBBui: 3597 case AArch64::LDRSBWui: 3598 case AArch64::LDRSBXui: 3599 case AArch64::STRBui: 3600 case AArch64::STRBBui: 3601 Scale = TypeSize::getFixed(1); 3602 Width = TypeSize::getFixed(1); 3603 MinOffset = 0; 3604 MaxOffset = 4095; 3605 break; 3606 // post/pre inc 3607 case AArch64::STRQpre: 3608 case AArch64::LDRQpost: 3609 Scale = TypeSize::getFixed(1); 3610 Width = TypeSize::getFixed(16); 3611 MinOffset = -256; 3612 MaxOffset = 255; 3613 break; 3614 case AArch64::STRXpre: 3615 case AArch64::STRDpre: 3616 case AArch64::LDRXpost: 3617 case AArch64::LDRDpost: 3618 Scale = TypeSize::getFixed(1); 3619 Width = TypeSize::getFixed(8); 3620 MinOffset = -256; 3621 MaxOffset = 255; 3622 break; 3623 case AArch64::STRWpost: 3624 case AArch64::LDRWpost: 3625 Scale = TypeSize::getFixed(4); 3626 Width = TypeSize::getFixed(32); 3627 MinOffset = -256; 3628 MaxOffset = 255; 3629 break; 3630 // Unscaled 3631 case AArch64::LDURQi: 3632 case AArch64::STURQi: 3633 Scale = 
TypeSize::getFixed(1); 3634 Width = TypeSize::getFixed(16); 3635 MinOffset = -256; 3636 MaxOffset = 255; 3637 break; 3638 case AArch64::LDURXi: 3639 case AArch64::LDURDi: 3640 case AArch64::LDAPURXi: 3641 case AArch64::STURXi: 3642 case AArch64::STURDi: 3643 case AArch64::STLURXi: 3644 case AArch64::PRFUMi: 3645 Scale = TypeSize::getFixed(1); 3646 Width = TypeSize::getFixed(8); 3647 MinOffset = -256; 3648 MaxOffset = 255; 3649 break; 3650 case AArch64::LDURWi: 3651 case AArch64::LDURSi: 3652 case AArch64::LDURSWi: 3653 case AArch64::LDAPURi: 3654 case AArch64::LDAPURSWi: 3655 case AArch64::STURWi: 3656 case AArch64::STURSi: 3657 case AArch64::STLURWi: 3658 Scale = TypeSize::getFixed(1); 3659 Width = TypeSize::getFixed(4); 3660 MinOffset = -256; 3661 MaxOffset = 255; 3662 break; 3663 case AArch64::LDURHi: 3664 case AArch64::LDURHHi: 3665 case AArch64::LDURSHXi: 3666 case AArch64::LDURSHWi: 3667 case AArch64::LDAPURHi: 3668 case AArch64::LDAPURSHWi: 3669 case AArch64::LDAPURSHXi: 3670 case AArch64::STURHi: 3671 case AArch64::STURHHi: 3672 case AArch64::STLURHi: 3673 Scale = TypeSize::getFixed(1); 3674 Width = TypeSize::getFixed(2); 3675 MinOffset = -256; 3676 MaxOffset = 255; 3677 break; 3678 case AArch64::LDURBi: 3679 case AArch64::LDURBBi: 3680 case AArch64::LDURSBXi: 3681 case AArch64::LDURSBWi: 3682 case AArch64::LDAPURBi: 3683 case AArch64::LDAPURSBWi: 3684 case AArch64::LDAPURSBXi: 3685 case AArch64::STURBi: 3686 case AArch64::STURBBi: 3687 case AArch64::STLURBi: 3688 Scale = TypeSize::getFixed(1); 3689 Width = TypeSize::getFixed(1); 3690 MinOffset = -256; 3691 MaxOffset = 255; 3692 break; 3693 // LDP / STP 3694 case AArch64::LDPQi: 3695 case AArch64::LDNPQi: 3696 case AArch64::STPQi: 3697 case AArch64::STNPQi: 3698 Scale = TypeSize::getFixed(16); 3699 Width = TypeSize::getFixed(32); 3700 MinOffset = -64; 3701 MaxOffset = 63; 3702 break; 3703 case AArch64::LDPXi: 3704 case AArch64::LDPDi: 3705 case AArch64::LDNPXi: 3706 case AArch64::LDNPDi: 3707 case AArch64::STPXi: 3708 case AArch64::STPDi: 3709 case AArch64::STNPXi: 3710 case AArch64::STNPDi: 3711 Scale = TypeSize::getFixed(8); 3712 Width = TypeSize::getFixed(16); 3713 MinOffset = -64; 3714 MaxOffset = 63; 3715 break; 3716 case AArch64::LDPWi: 3717 case AArch64::LDPSi: 3718 case AArch64::LDNPWi: 3719 case AArch64::LDNPSi: 3720 case AArch64::STPWi: 3721 case AArch64::STPSi: 3722 case AArch64::STNPWi: 3723 case AArch64::STNPSi: 3724 Scale = TypeSize::getFixed(4); 3725 Width = TypeSize::getFixed(8); 3726 MinOffset = -64; 3727 MaxOffset = 63; 3728 break; 3729 // pre/post inc 3730 case AArch64::STPQpre: 3731 case AArch64::LDPQpost: 3732 Scale = TypeSize::getFixed(16); 3733 Width = TypeSize::getFixed(16); 3734 MinOffset = -1024; 3735 MaxOffset = 1008; 3736 break; 3737 case AArch64::STPXpre: 3738 case AArch64::LDPXpost: 3739 case AArch64::STPDpre: 3740 case AArch64::LDPDpost: 3741 Scale = TypeSize::getFixed(8); 3742 Width = TypeSize::getFixed(8); 3743 MinOffset = -512; 3744 MaxOffset = 504; 3745 break; 3746 case AArch64::StoreSwiftAsyncContext: 3747 // Store is an STRXui, but there might be an ADDXri in the expansion too. 
3748 Scale = TypeSize::getFixed(1); 3749 Width = TypeSize::getFixed(8); 3750 MinOffset = 0; 3751 MaxOffset = 4095; 3752 break; 3753 case AArch64::ADDG: 3754 Scale = TypeSize::getFixed(16); 3755 Width = TypeSize::getFixed(0); 3756 MinOffset = 0; 3757 MaxOffset = 63; 3758 break; 3759 case AArch64::TAGPstack: 3760 Scale = TypeSize::getFixed(16); 3761 Width = TypeSize::getFixed(0); 3762 // TAGP with a negative offset turns into SUBP, which has a maximum offset 3763 // of 63 (not 64!). 3764 MinOffset = -63; 3765 MaxOffset = 63; 3766 break; 3767 case AArch64::LDG: 3768 case AArch64::STGi: 3769 case AArch64::STZGi: 3770 Scale = TypeSize::getFixed(16); 3771 Width = TypeSize::getFixed(16); 3772 MinOffset = -256; 3773 MaxOffset = 255; 3774 break; 3775 // SVE 3776 case AArch64::STR_ZZZZXI: 3777 case AArch64::LDR_ZZZZXI: 3778 Scale = TypeSize::getScalable(16); 3779 Width = TypeSize::getScalable(16 * 4); 3780 MinOffset = -256; 3781 MaxOffset = 252; 3782 break; 3783 case AArch64::STR_ZZZXI: 3784 case AArch64::LDR_ZZZXI: 3785 Scale = TypeSize::getScalable(16); 3786 Width = TypeSize::getScalable(16 * 3); 3787 MinOffset = -256; 3788 MaxOffset = 253; 3789 break; 3790 case AArch64::STR_ZZXI: 3791 case AArch64::LDR_ZZXI: 3792 Scale = TypeSize::getScalable(16); 3793 Width = TypeSize::getScalable(16 * 2); 3794 MinOffset = -256; 3795 MaxOffset = 254; 3796 break; 3797 case AArch64::LDR_PXI: 3798 case AArch64::STR_PXI: 3799 Scale = TypeSize::getScalable(2); 3800 Width = TypeSize::getScalable(2); 3801 MinOffset = -256; 3802 MaxOffset = 255; 3803 break; 3804 case AArch64::LDR_PPXI: 3805 case AArch64::STR_PPXI: 3806 Scale = TypeSize::getScalable(2); 3807 Width = TypeSize::getScalable(2 * 2); 3808 MinOffset = -256; 3809 MaxOffset = 254; 3810 break; 3811 case AArch64::LDR_ZXI: 3812 case AArch64::STR_ZXI: 3813 Scale = TypeSize::getScalable(16); 3814 Width = TypeSize::getScalable(16); 3815 MinOffset = -256; 3816 MaxOffset = 255; 3817 break; 3818 case AArch64::LD1B_IMM: 3819 case AArch64::LD1H_IMM: 3820 case AArch64::LD1W_IMM: 3821 case AArch64::LD1D_IMM: 3822 case AArch64::LDNT1B_ZRI: 3823 case AArch64::LDNT1H_ZRI: 3824 case AArch64::LDNT1W_ZRI: 3825 case AArch64::LDNT1D_ZRI: 3826 case AArch64::ST1B_IMM: 3827 case AArch64::ST1H_IMM: 3828 case AArch64::ST1W_IMM: 3829 case AArch64::ST1D_IMM: 3830 case AArch64::STNT1B_ZRI: 3831 case AArch64::STNT1H_ZRI: 3832 case AArch64::STNT1W_ZRI: 3833 case AArch64::STNT1D_ZRI: 3834 case AArch64::LDNF1B_IMM: 3835 case AArch64::LDNF1H_IMM: 3836 case AArch64::LDNF1W_IMM: 3837 case AArch64::LDNF1D_IMM: 3838 // A full vectors worth of data 3839 // Width = mbytes * elements 3840 Scale = TypeSize::getScalable(16); 3841 Width = TypeSize::getScalable(16); 3842 MinOffset = -8; 3843 MaxOffset = 7; 3844 break; 3845 case AArch64::LD2B_IMM: 3846 case AArch64::LD2H_IMM: 3847 case AArch64::LD2W_IMM: 3848 case AArch64::LD2D_IMM: 3849 case AArch64::ST2B_IMM: 3850 case AArch64::ST2H_IMM: 3851 case AArch64::ST2W_IMM: 3852 case AArch64::ST2D_IMM: 3853 Scale = TypeSize::getScalable(32); 3854 Width = TypeSize::getScalable(16 * 2); 3855 MinOffset = -8; 3856 MaxOffset = 7; 3857 break; 3858 case AArch64::LD3B_IMM: 3859 case AArch64::LD3H_IMM: 3860 case AArch64::LD3W_IMM: 3861 case AArch64::LD3D_IMM: 3862 case AArch64::ST3B_IMM: 3863 case AArch64::ST3H_IMM: 3864 case AArch64::ST3W_IMM: 3865 case AArch64::ST3D_IMM: 3866 Scale = TypeSize::getScalable(48); 3867 Width = TypeSize::getScalable(16 * 3); 3868 MinOffset = -8; 3869 MaxOffset = 7; 3870 break; 3871 case AArch64::LD4B_IMM: 3872 case AArch64::LD4H_IMM: 3873 
case AArch64::LD4W_IMM: 3874 case AArch64::LD4D_IMM: 3875 case AArch64::ST4B_IMM: 3876 case AArch64::ST4H_IMM: 3877 case AArch64::ST4W_IMM: 3878 case AArch64::ST4D_IMM: 3879 Scale = TypeSize::getScalable(64); 3880 Width = TypeSize::getScalable(16 * 4); 3881 MinOffset = -8; 3882 MaxOffset = 7; 3883 break; 3884 case AArch64::LD1B_H_IMM: 3885 case AArch64::LD1SB_H_IMM: 3886 case AArch64::LD1H_S_IMM: 3887 case AArch64::LD1SH_S_IMM: 3888 case AArch64::LD1W_D_IMM: 3889 case AArch64::LD1SW_D_IMM: 3890 case AArch64::ST1B_H_IMM: 3891 case AArch64::ST1H_S_IMM: 3892 case AArch64::ST1W_D_IMM: 3893 case AArch64::LDNF1B_H_IMM: 3894 case AArch64::LDNF1SB_H_IMM: 3895 case AArch64::LDNF1H_S_IMM: 3896 case AArch64::LDNF1SH_S_IMM: 3897 case AArch64::LDNF1W_D_IMM: 3898 case AArch64::LDNF1SW_D_IMM: 3899 // A half vector worth of data 3900 // Width = mbytes * elements 3901 Scale = TypeSize::getScalable(8); 3902 Width = TypeSize::getScalable(8); 3903 MinOffset = -8; 3904 MaxOffset = 7; 3905 break; 3906 case AArch64::LD1B_S_IMM: 3907 case AArch64::LD1SB_S_IMM: 3908 case AArch64::LD1H_D_IMM: 3909 case AArch64::LD1SH_D_IMM: 3910 case AArch64::ST1B_S_IMM: 3911 case AArch64::ST1H_D_IMM: 3912 case AArch64::LDNF1B_S_IMM: 3913 case AArch64::LDNF1SB_S_IMM: 3914 case AArch64::LDNF1H_D_IMM: 3915 case AArch64::LDNF1SH_D_IMM: 3916 // A quarter vector worth of data 3917 // Width = mbytes * elements 3918 Scale = TypeSize::getScalable(4); 3919 Width = TypeSize::getScalable(4); 3920 MinOffset = -8; 3921 MaxOffset = 7; 3922 break; 3923 case AArch64::LD1B_D_IMM: 3924 case AArch64::LD1SB_D_IMM: 3925 case AArch64::ST1B_D_IMM: 3926 case AArch64::LDNF1B_D_IMM: 3927 case AArch64::LDNF1SB_D_IMM: 3928 // A eighth vector worth of data 3929 // Width = mbytes * elements 3930 Scale = TypeSize::getScalable(2); 3931 Width = TypeSize::getScalable(2); 3932 MinOffset = -8; 3933 MaxOffset = 7; 3934 break; 3935 case AArch64::ST2Gi: 3936 case AArch64::STZ2Gi: 3937 Scale = TypeSize::getFixed(16); 3938 Width = TypeSize::getFixed(32); 3939 MinOffset = -256; 3940 MaxOffset = 255; 3941 break; 3942 case AArch64::STGPi: 3943 Scale = TypeSize::getFixed(16); 3944 Width = TypeSize::getFixed(16); 3945 MinOffset = -64; 3946 MaxOffset = 63; 3947 break; 3948 case AArch64::LD1RB_IMM: 3949 case AArch64::LD1RB_H_IMM: 3950 case AArch64::LD1RB_S_IMM: 3951 case AArch64::LD1RB_D_IMM: 3952 case AArch64::LD1RSB_H_IMM: 3953 case AArch64::LD1RSB_S_IMM: 3954 case AArch64::LD1RSB_D_IMM: 3955 Scale = TypeSize::getFixed(1); 3956 Width = TypeSize::getFixed(1); 3957 MinOffset = 0; 3958 MaxOffset = 63; 3959 break; 3960 case AArch64::LD1RH_IMM: 3961 case AArch64::LD1RH_S_IMM: 3962 case AArch64::LD1RH_D_IMM: 3963 case AArch64::LD1RSH_S_IMM: 3964 case AArch64::LD1RSH_D_IMM: 3965 Scale = TypeSize::getFixed(2); 3966 Width = TypeSize::getFixed(2); 3967 MinOffset = 0; 3968 MaxOffset = 63; 3969 break; 3970 case AArch64::LD1RW_IMM: 3971 case AArch64::LD1RW_D_IMM: 3972 case AArch64::LD1RSW_IMM: 3973 Scale = TypeSize::getFixed(4); 3974 Width = TypeSize::getFixed(4); 3975 MinOffset = 0; 3976 MaxOffset = 63; 3977 break; 3978 case AArch64::LD1RD_IMM: 3979 Scale = TypeSize::getFixed(8); 3980 Width = TypeSize::getFixed(8); 3981 MinOffset = 0; 3982 MaxOffset = 63; 3983 break; 3984 } 3985 3986 return true; 3987 } 3988 3989 // Scaling factor for unscaled load or store. 
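// For example, getMemScale returns 1 for LDURBBi, 4 for LDRWui and 16 for
// LDRQui; for paired instructions such as LDPXi it is the size of a single
// element (8 bytes), not of the whole pair.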
3990 int AArch64InstrInfo::getMemScale(unsigned Opc) { 3991 switch (Opc) { 3992 default: 3993 llvm_unreachable("Opcode has unknown scale!"); 3994 case AArch64::LDRBBui: 3995 case AArch64::LDURBBi: 3996 case AArch64::LDRSBWui: 3997 case AArch64::LDURSBWi: 3998 case AArch64::STRBBui: 3999 case AArch64::STURBBi: 4000 return 1; 4001 case AArch64::LDRHHui: 4002 case AArch64::LDURHHi: 4003 case AArch64::LDRSHWui: 4004 case AArch64::LDURSHWi: 4005 case AArch64::STRHHui: 4006 case AArch64::STURHHi: 4007 return 2; 4008 case AArch64::LDRSui: 4009 case AArch64::LDURSi: 4010 case AArch64::LDRSpre: 4011 case AArch64::LDRSWui: 4012 case AArch64::LDURSWi: 4013 case AArch64::LDRSWpre: 4014 case AArch64::LDRWpre: 4015 case AArch64::LDRWui: 4016 case AArch64::LDURWi: 4017 case AArch64::STRSui: 4018 case AArch64::STURSi: 4019 case AArch64::STRSpre: 4020 case AArch64::STRWui: 4021 case AArch64::STURWi: 4022 case AArch64::STRWpre: 4023 case AArch64::LDPSi: 4024 case AArch64::LDPSWi: 4025 case AArch64::LDPWi: 4026 case AArch64::STPSi: 4027 case AArch64::STPWi: 4028 return 4; 4029 case AArch64::LDRDui: 4030 case AArch64::LDURDi: 4031 case AArch64::LDRDpre: 4032 case AArch64::LDRXui: 4033 case AArch64::LDURXi: 4034 case AArch64::LDRXpre: 4035 case AArch64::STRDui: 4036 case AArch64::STURDi: 4037 case AArch64::STRDpre: 4038 case AArch64::STRXui: 4039 case AArch64::STURXi: 4040 case AArch64::STRXpre: 4041 case AArch64::LDPDi: 4042 case AArch64::LDPXi: 4043 case AArch64::STPDi: 4044 case AArch64::STPXi: 4045 return 8; 4046 case AArch64::LDRQui: 4047 case AArch64::LDURQi: 4048 case AArch64::STRQui: 4049 case AArch64::STURQi: 4050 case AArch64::STRQpre: 4051 case AArch64::LDPQi: 4052 case AArch64::LDRQpre: 4053 case AArch64::STPQi: 4054 case AArch64::STGi: 4055 case AArch64::STZGi: 4056 case AArch64::ST2Gi: 4057 case AArch64::STZ2Gi: 4058 case AArch64::STGPi: 4059 return 16; 4060 } 4061 } 4062 4063 bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) { 4064 switch (MI.getOpcode()) { 4065 default: 4066 return false; 4067 case AArch64::LDRWpre: 4068 case AArch64::LDRXpre: 4069 case AArch64::LDRSWpre: 4070 case AArch64::LDRSpre: 4071 case AArch64::LDRDpre: 4072 case AArch64::LDRQpre: 4073 return true; 4074 } 4075 } 4076 4077 bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) { 4078 switch (MI.getOpcode()) { 4079 default: 4080 return false; 4081 case AArch64::STRWpre: 4082 case AArch64::STRXpre: 4083 case AArch64::STRSpre: 4084 case AArch64::STRDpre: 4085 case AArch64::STRQpre: 4086 return true; 4087 } 4088 } 4089 4090 bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) { 4091 return isPreLd(MI) || isPreSt(MI); 4092 } 4093 4094 bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) { 4095 switch (MI.getOpcode()) { 4096 default: 4097 return false; 4098 case AArch64::LDPSi: 4099 case AArch64::LDPSWi: 4100 case AArch64::LDPDi: 4101 case AArch64::LDPQi: 4102 case AArch64::LDPWi: 4103 case AArch64::LDPXi: 4104 case AArch64::STPSi: 4105 case AArch64::STPDi: 4106 case AArch64::STPQi: 4107 case AArch64::STPWi: 4108 case AArch64::STPXi: 4109 case AArch64::STGPi: 4110 return true; 4111 } 4112 } 4113 4114 const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) { 4115 unsigned Idx = 4116 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 
2 4117 : 1; 4118 return MI.getOperand(Idx); 4119 } 4120 4121 const MachineOperand & 4122 AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) { 4123 unsigned Idx = 4124 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3 4125 : 2; 4126 return MI.getOperand(Idx); 4127 } 4128 4129 static const TargetRegisterClass *getRegClass(const MachineInstr &MI, 4130 Register Reg) { 4131 if (MI.getParent() == nullptr) 4132 return nullptr; 4133 const MachineFunction *MF = MI.getParent()->getParent(); 4134 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr; 4135 } 4136 4137 bool AArch64InstrInfo::isHForm(const MachineInstr &MI) { 4138 auto IsHFPR = [&](const MachineOperand &Op) { 4139 if (!Op.isReg()) 4140 return false; 4141 auto Reg = Op.getReg(); 4142 if (Reg.isPhysical()) 4143 return AArch64::FPR16RegClass.contains(Reg); 4144 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); 4145 return TRC == &AArch64::FPR16RegClass || 4146 TRC == &AArch64::FPR16_loRegClass; 4147 }; 4148 return llvm::any_of(MI.operands(), IsHFPR); 4149 } 4150 4151 bool AArch64InstrInfo::isQForm(const MachineInstr &MI) { 4152 auto IsQFPR = [&](const MachineOperand &Op) { 4153 if (!Op.isReg()) 4154 return false; 4155 auto Reg = Op.getReg(); 4156 if (Reg.isPhysical()) 4157 return AArch64::FPR128RegClass.contains(Reg); 4158 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); 4159 return TRC == &AArch64::FPR128RegClass || 4160 TRC == &AArch64::FPR128_loRegClass; 4161 }; 4162 return llvm::any_of(MI.operands(), IsQFPR); 4163 } 4164 4165 bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) { 4166 switch (MI.getOpcode()) { 4167 case AArch64::BRK: 4168 case AArch64::HLT: 4169 case AArch64::PACIASP: 4170 case AArch64::PACIBSP: 4171 // Implicit BTI behavior. 4172 return true; 4173 case AArch64::PAUTH_PROLOGUE: 4174 // PAUTH_PROLOGUE expands to PACI(A|B)SP. 4175 return true; 4176 case AArch64::HINT: { 4177 unsigned Imm = MI.getOperand(0).getImm(); 4178 // Explicit BTI instruction. 4179 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38) 4180 return true; 4181 // PACI(A|B)SP instructions. 4182 if (Imm == 25 || Imm == 27) 4183 return true; 4184 return false; 4185 } 4186 default: 4187 return false; 4188 } 4189 } 4190 4191 bool AArch64InstrInfo::isFpOrNEON(Register Reg) { 4192 if (Reg == 0) 4193 return false; 4194 assert(Reg.isPhysical() && "Expected physical register in isFpOrNEON"); 4195 return AArch64::FPR128RegClass.contains(Reg) || 4196 AArch64::FPR64RegClass.contains(Reg) || 4197 AArch64::FPR32RegClass.contains(Reg) || 4198 AArch64::FPR16RegClass.contains(Reg) || 4199 AArch64::FPR8RegClass.contains(Reg); 4200 } 4201 4202 bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) { 4203 auto IsFPR = [&](const MachineOperand &Op) { 4204 if (!Op.isReg()) 4205 return false; 4206 auto Reg = Op.getReg(); 4207 if (Reg.isPhysical()) 4208 return isFpOrNEON(Reg); 4209 4210 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); 4211 return TRC == &AArch64::FPR128RegClass || 4212 TRC == &AArch64::FPR128_loRegClass || 4213 TRC == &AArch64::FPR64RegClass || 4214 TRC == &AArch64::FPR64_loRegClass || 4215 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass || 4216 TRC == &AArch64::FPR8RegClass; 4217 }; 4218 return llvm::any_of(MI.operands(), IsFPR); 4219 } 4220 4221 // Scale the unscaled offsets. Returns false if the unscaled offset can't be 4222 // scaled. 
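// For example, an STURXi byte offset of 24 becomes an element offset of
// 3 (24 / 8), while a byte offset of 20 is rejected because it is not a
// multiple of the 8-byte stride.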
4223 static bool scaleOffset(unsigned Opc, int64_t &Offset) { 4224 int Scale = AArch64InstrInfo::getMemScale(Opc); 4225 4226 // If the byte-offset isn't a multiple of the stride, we can't scale this 4227 // offset. 4228 if (Offset % Scale != 0) 4229 return false; 4230 4231 // Convert the byte-offset used by unscaled into an "element" offset used 4232 // by the scaled pair load/store instructions. 4233 Offset /= Scale; 4234 return true; 4235 } 4236 4237 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) { 4238 if (FirstOpc == SecondOpc) 4239 return true; 4240 // We can also pair sign-ext and zero-ext instructions. 4241 switch (FirstOpc) { 4242 default: 4243 return false; 4244 case AArch64::STRSui: 4245 case AArch64::STURSi: 4246 return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi; 4247 case AArch64::STRDui: 4248 case AArch64::STURDi: 4249 return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi; 4250 case AArch64::STRQui: 4251 case AArch64::STURQi: 4252 return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi; 4253 case AArch64::STRWui: 4254 case AArch64::STURWi: 4255 return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi; 4256 case AArch64::STRXui: 4257 case AArch64::STURXi: 4258 return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi; 4259 case AArch64::LDRSui: 4260 case AArch64::LDURSi: 4261 return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi; 4262 case AArch64::LDRDui: 4263 case AArch64::LDURDi: 4264 return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi; 4265 case AArch64::LDRQui: 4266 case AArch64::LDURQi: 4267 return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi; 4268 case AArch64::LDRWui: 4269 case AArch64::LDURWi: 4270 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi; 4271 case AArch64::LDRSWui: 4272 case AArch64::LDURSWi: 4273 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi; 4274 case AArch64::LDRXui: 4275 case AArch64::LDURXi: 4276 return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi; 4277 } 4278 // These instructions can't be paired based on their opcodes. 4279 return false; 4280 } 4281 4282 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, 4283 int64_t Offset1, unsigned Opcode1, int FI2, 4284 int64_t Offset2, unsigned Opcode2) { 4285 // Accesses through fixed stack object frame indices may access a different 4286 // fixed stack slot. Check that the object offsets + offsets match. 4287 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) { 4288 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1); 4289 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2); 4290 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered."); 4291 // Convert to scaled object offsets. 4292 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1); 4293 if (ObjectOffset1 % Scale1 != 0) 4294 return false; 4295 ObjectOffset1 /= Scale1; 4296 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2); 4297 if (ObjectOffset2 % Scale2 != 0) 4298 return false; 4299 ObjectOffset2 /= Scale2; 4300 ObjectOffset1 += Offset1; 4301 ObjectOffset2 += Offset2; 4302 return ObjectOffset1 + 1 == ObjectOffset2; 4303 } 4304 4305 return FI1 == FI2; 4306 } 4307 4308 /// Detect opportunities for ldp/stp formation. 4309 /// 4310 /// Only called for LdSt for which getMemOperandWithOffset returns true. 
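/// In outline, two accesses are clustered only when they use the same base
/// (register or frame index), their opcodes are pairable, their scaled
/// offsets are consecutive, and the lower offset fits the 7-bit signed
/// immediate of the paired forms.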
4311 bool AArch64InstrInfo::shouldClusterMemOps( 4312 ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1, 4313 bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2, 4314 int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize, 4315 unsigned NumBytes) const { 4316 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1); 4317 const MachineOperand &BaseOp1 = *BaseOps1.front(); 4318 const MachineOperand &BaseOp2 = *BaseOps2.front(); 4319 const MachineInstr &FirstLdSt = *BaseOp1.getParent(); 4320 const MachineInstr &SecondLdSt = *BaseOp2.getParent(); 4321 if (BaseOp1.getType() != BaseOp2.getType()) 4322 return false; 4323 4324 assert((BaseOp1.isReg() || BaseOp1.isFI()) && 4325 "Only base registers and frame indices are supported."); 4326 4327 // Check for both base regs and base FI. 4328 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg()) 4329 return false; 4330 4331 // Only cluster up to a single pair. 4332 if (ClusterSize > 2) 4333 return false; 4334 4335 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt)) 4336 return false; 4337 4338 // Can we pair these instructions based on their opcodes? 4339 unsigned FirstOpc = FirstLdSt.getOpcode(); 4340 unsigned SecondOpc = SecondLdSt.getOpcode(); 4341 if (!canPairLdStOpc(FirstOpc, SecondOpc)) 4342 return false; 4343 4344 // Can't merge volatiles or load/stores that have a hint to avoid pair 4345 // formation, for example. 4346 if (!isCandidateToMergeOrPair(FirstLdSt) || 4347 !isCandidateToMergeOrPair(SecondLdSt)) 4348 return false; 4349 4350 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate. 4351 int64_t Offset1 = FirstLdSt.getOperand(2).getImm(); 4352 if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1)) 4353 return false; 4354 4355 int64_t Offset2 = SecondLdSt.getOperand(2).getImm(); 4356 if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2)) 4357 return false; 4358 4359 // Pairwise instructions have a 7-bit signed offset field. 4360 if (Offset1 > 63 || Offset1 < -64) 4361 return false; 4362 4363 // The caller should already have ordered First/SecondLdSt by offset. 4364 // Note: except for non-equal frame index bases 4365 if (BaseOp1.isFI()) { 4366 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) && 4367 "Caller should have ordered offsets."); 4368 4369 const MachineFrameInfo &MFI = 4370 FirstLdSt.getParent()->getParent()->getFrameInfo(); 4371 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc, 4372 BaseOp2.getIndex(), Offset2, SecondOpc); 4373 } 4374 4375 assert(Offset1 <= Offset2 && "Caller should have ordered offsets."); 4376 4377 return Offset1 + 1 == Offset2; 4378 } 4379 4380 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, 4381 unsigned Reg, unsigned SubIdx, 4382 unsigned State, 4383 const TargetRegisterInfo *TRI) { 4384 if (!SubIdx) 4385 return MIB.addReg(Reg, State); 4386 4387 if (Register::isPhysicalRegister(Reg)) 4388 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); 4389 return MIB.addReg(Reg, State, SubIdx); 4390 } 4391 4392 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, 4393 unsigned NumRegs) { 4394 // We really want the positive remainder mod 32 here, that happens to be 4395 // easily obtainable with a mask. 
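  // For example, with DestReg encoding 1, SrcReg encoding 30 and NumRegs 4,
  // (1 - 30) & 0x1f == 3 < 4, so the source and destination tuples overlap
  // and copyPhysRegTuple copies the sub-registers in reverse order.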
4396 return ((DestReg - SrcReg) & 0x1f) < NumRegs; 4397 } 4398 4399 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB, 4400 MachineBasicBlock::iterator I, 4401 const DebugLoc &DL, MCRegister DestReg, 4402 MCRegister SrcReg, bool KillSrc, 4403 unsigned Opcode, 4404 ArrayRef<unsigned> Indices) const { 4405 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON"); 4406 const TargetRegisterInfo *TRI = &getRegisterInfo(); 4407 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 4408 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 4409 unsigned NumRegs = Indices.size(); 4410 4411 int SubReg = 0, End = NumRegs, Incr = 1; 4412 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) { 4413 SubReg = NumRegs - 1; 4414 End = -1; 4415 Incr = -1; 4416 } 4417 4418 for (; SubReg != End; SubReg += Incr) { 4419 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 4420 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 4421 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); 4422 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 4423 } 4424 } 4425 4426 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB, 4427 MachineBasicBlock::iterator I, 4428 DebugLoc DL, unsigned DestReg, 4429 unsigned SrcReg, bool KillSrc, 4430 unsigned Opcode, unsigned ZeroReg, 4431 llvm::ArrayRef<unsigned> Indices) const { 4432 const TargetRegisterInfo *TRI = &getRegisterInfo(); 4433 unsigned NumRegs = Indices.size(); 4434 4435 #ifndef NDEBUG 4436 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 4437 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 4438 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 && 4439 "GPR reg sequences should not be able to overlap"); 4440 #endif 4441 4442 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) { 4443 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 4444 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 4445 MIB.addReg(ZeroReg); 4446 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 4447 MIB.addImm(0); 4448 } 4449 } 4450 4451 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, 4452 MachineBasicBlock::iterator I, 4453 const DebugLoc &DL, MCRegister DestReg, 4454 MCRegister SrcReg, bool KillSrc) const { 4455 if (AArch64::GPR32spRegClass.contains(DestReg) && 4456 (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { 4457 const TargetRegisterInfo *TRI = &getRegisterInfo(); 4458 4459 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) { 4460 // If either operand is WSP, expand to ADD #0. 4461 if (Subtarget.hasZeroCycleRegMove()) { 4462 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. 4463 MCRegister DestRegX = TRI->getMatchingSuperReg( 4464 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 4465 MCRegister SrcRegX = TRI->getMatchingSuperReg( 4466 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 4467 // This instruction is reading and writing X registers. This may upset 4468 // the register scavenger and machine verifier, so we need to indicate 4469 // that we are reading an undefined value from SrcRegX, but a proper 4470 // value from SrcReg. 
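      // The implicit use of SrcReg added below is what models the real data
      // dependency and carries the kill state.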
4471 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX) 4472 .addReg(SrcRegX, RegState::Undef) 4473 .addImm(0) 4474 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) 4475 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 4476 } else { 4477 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg) 4478 .addReg(SrcReg, getKillRegState(KillSrc)) 4479 .addImm(0) 4480 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 4481 } 4482 } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) { 4483 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg) 4484 .addImm(0) 4485 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 4486 } else { 4487 if (Subtarget.hasZeroCycleRegMove()) { 4488 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. 4489 MCRegister DestRegX = TRI->getMatchingSuperReg( 4490 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 4491 MCRegister SrcRegX = TRI->getMatchingSuperReg( 4492 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 4493 // This instruction is reading and writing X registers. This may upset 4494 // the register scavenger and machine verifier, so we need to indicate 4495 // that we are reading an undefined value from SrcRegX, but a proper 4496 // value from SrcReg. 4497 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX) 4498 .addReg(AArch64::XZR) 4499 .addReg(SrcRegX, RegState::Undef) 4500 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 4501 } else { 4502 // Otherwise, expand to ORR WZR. 4503 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg) 4504 .addReg(AArch64::WZR) 4505 .addReg(SrcReg, getKillRegState(KillSrc)); 4506 } 4507 } 4508 return; 4509 } 4510 4511 // Copy a Predicate register by ORRing with itself. 4512 if (AArch64::PPRRegClass.contains(DestReg) && 4513 AArch64::PPRRegClass.contains(SrcReg)) { 4514 assert(Subtarget.isSVEorStreamingSVEAvailable() && 4515 "Unexpected SVE register."); 4516 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg) 4517 .addReg(SrcReg) // Pg 4518 .addReg(SrcReg) 4519 .addReg(SrcReg, getKillRegState(KillSrc)); 4520 return; 4521 } 4522 4523 // Copy a predicate-as-counter register by ORRing with itself as if it 4524 // were a regular predicate (mask) register. 4525 bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg); 4526 bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg); 4527 if (DestIsPNR || SrcIsPNR) { 4528 auto ToPPR = [](MCRegister R) -> MCRegister { 4529 return (R - AArch64::PN0) + AArch64::P0; 4530 }; 4531 MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg; 4532 MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg; 4533 4534 if (PPRSrcReg != PPRDestReg) { 4535 auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg) 4536 .addReg(PPRSrcReg) // Pg 4537 .addReg(PPRSrcReg) 4538 .addReg(PPRSrcReg, getKillRegState(KillSrc)); 4539 if (DestIsPNR) 4540 NewMI.addDef(DestReg, RegState::Implicit); 4541 } 4542 return; 4543 } 4544 4545 // Copy a Z register by ORRing with itself. 4546 if (AArch64::ZPRRegClass.contains(DestReg) && 4547 AArch64::ZPRRegClass.contains(SrcReg)) { 4548 assert(Subtarget.isSVEorStreamingSVEAvailable() && 4549 "Unexpected SVE register."); 4550 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg) 4551 .addReg(SrcReg) 4552 .addReg(SrcReg, getKillRegState(KillSrc)); 4553 return; 4554 } 4555 4556 // Copy a Z register pair by copying the individual sub-registers. 
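  // (copyPhysRegTuple emits one ORR per sub-register and orders the copies so
  // that an overlapping source tuple is not clobbered; see
  // forwardCopyWillClobberTuple above.)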
4557 if ((AArch64::ZPR2RegClass.contains(DestReg) || 4558 AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) && 4559 (AArch64::ZPR2RegClass.contains(SrcReg) || 4560 AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) { 4561 assert(Subtarget.isSVEorStreamingSVEAvailable() && 4562 "Unexpected SVE register."); 4563 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1}; 4564 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 4565 Indices); 4566 return; 4567 } 4568 4569 // Copy a Z register triple by copying the individual sub-registers. 4570 if (AArch64::ZPR3RegClass.contains(DestReg) && 4571 AArch64::ZPR3RegClass.contains(SrcReg)) { 4572 assert(Subtarget.isSVEorStreamingSVEAvailable() && 4573 "Unexpected SVE register."); 4574 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, 4575 AArch64::zsub2}; 4576 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 4577 Indices); 4578 return; 4579 } 4580 4581 // Copy a Z register quad by copying the individual sub-registers. 4582 if ((AArch64::ZPR4RegClass.contains(DestReg) || 4583 AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) && 4584 (AArch64::ZPR4RegClass.contains(SrcReg) || 4585 AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) { 4586 assert(Subtarget.isSVEorStreamingSVEAvailable() && 4587 "Unexpected SVE register."); 4588 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, 4589 AArch64::zsub2, AArch64::zsub3}; 4590 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 4591 Indices); 4592 return; 4593 } 4594 4595 if (AArch64::GPR64spRegClass.contains(DestReg) && 4596 (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { 4597 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { 4598 // If either operand is SP, expand to ADD #0. 4599 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg) 4600 .addReg(SrcReg, getKillRegState(KillSrc)) 4601 .addImm(0) 4602 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 4603 } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) { 4604 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg) 4605 .addImm(0) 4606 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 4607 } else { 4608 // Otherwise, expand to ORR XZR. 4609 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg) 4610 .addReg(AArch64::XZR) 4611 .addReg(SrcReg, getKillRegState(KillSrc)); 4612 } 4613 return; 4614 } 4615 4616 // Copy a DDDD register quad by copying the individual sub-registers. 4617 if (AArch64::DDDDRegClass.contains(DestReg) && 4618 AArch64::DDDDRegClass.contains(SrcReg)) { 4619 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 4620 AArch64::dsub2, AArch64::dsub3}; 4621 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 4622 Indices); 4623 return; 4624 } 4625 4626 // Copy a DDD register triple by copying the individual sub-registers. 4627 if (AArch64::DDDRegClass.contains(DestReg) && 4628 AArch64::DDDRegClass.contains(SrcReg)) { 4629 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 4630 AArch64::dsub2}; 4631 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 4632 Indices); 4633 return; 4634 } 4635 4636 // Copy a DD register pair by copying the individual sub-registers. 
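  // (These D- and Q-register tuple copies also go through copyPhysRegTuple,
  // which asserts that NEON is available.)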
4637 if (AArch64::DDRegClass.contains(DestReg) && 4638 AArch64::DDRegClass.contains(SrcReg)) { 4639 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1}; 4640 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 4641 Indices); 4642 return; 4643 } 4644 4645 // Copy a QQQQ register quad by copying the individual sub-registers. 4646 if (AArch64::QQQQRegClass.contains(DestReg) && 4647 AArch64::QQQQRegClass.contains(SrcReg)) { 4648 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 4649 AArch64::qsub2, AArch64::qsub3}; 4650 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 4651 Indices); 4652 return; 4653 } 4654 4655 // Copy a QQQ register triple by copying the individual sub-registers. 4656 if (AArch64::QQQRegClass.contains(DestReg) && 4657 AArch64::QQQRegClass.contains(SrcReg)) { 4658 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 4659 AArch64::qsub2}; 4660 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 4661 Indices); 4662 return; 4663 } 4664 4665 // Copy a QQ register pair by copying the individual sub-registers. 4666 if (AArch64::QQRegClass.contains(DestReg) && 4667 AArch64::QQRegClass.contains(SrcReg)) { 4668 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1}; 4669 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 4670 Indices); 4671 return; 4672 } 4673 4674 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) && 4675 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) { 4676 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64}; 4677 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs, 4678 AArch64::XZR, Indices); 4679 return; 4680 } 4681 4682 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) && 4683 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) { 4684 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32}; 4685 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs, 4686 AArch64::WZR, Indices); 4687 return; 4688 } 4689 4690 if (AArch64::FPR128RegClass.contains(DestReg) && 4691 AArch64::FPR128RegClass.contains(SrcReg)) { 4692 if (Subtarget.isSVEorStreamingSVEAvailable() && 4693 !Subtarget.isNeonAvailable()) 4694 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ)) 4695 .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define) 4696 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0)) 4697 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0)); 4698 else if (Subtarget.isNeonAvailable()) 4699 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 4700 .addReg(SrcReg) 4701 .addReg(SrcReg, getKillRegState(KillSrc)); 4702 else { 4703 BuildMI(MBB, I, DL, get(AArch64::STRQpre)) 4704 .addReg(AArch64::SP, RegState::Define) 4705 .addReg(SrcReg, getKillRegState(KillSrc)) 4706 .addReg(AArch64::SP) 4707 .addImm(-16); 4708 BuildMI(MBB, I, DL, get(AArch64::LDRQpost)) 4709 .addReg(AArch64::SP, RegState::Define) 4710 .addReg(DestReg, RegState::Define) 4711 .addReg(AArch64::SP) 4712 .addImm(16); 4713 } 4714 return; 4715 } 4716 4717 if (AArch64::FPR64RegClass.contains(DestReg) && 4718 AArch64::FPR64RegClass.contains(SrcReg)) { 4719 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg) 4720 .addReg(SrcReg, getKillRegState(KillSrc)); 4721 return; 4722 } 4723 4724 if (AArch64::FPR32RegClass.contains(DestReg) && 4725 AArch64::FPR32RegClass.contains(SrcReg)) { 4726 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 4727 .addReg(SrcReg, getKillRegState(KillSrc)); 4728 return; 4729 } 4730 4731 if 
(AArch64::FPR16RegClass.contains(DestReg) && 4732 AArch64::FPR16RegClass.contains(SrcReg)) { 4733 DestReg = 4734 RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass); 4735 SrcReg = 4736 RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass); 4737 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 4738 .addReg(SrcReg, getKillRegState(KillSrc)); 4739 return; 4740 } 4741 4742 if (AArch64::FPR8RegClass.contains(DestReg) && 4743 AArch64::FPR8RegClass.contains(SrcReg)) { 4744 DestReg = 4745 RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass); 4746 SrcReg = 4747 RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass); 4748 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 4749 .addReg(SrcReg, getKillRegState(KillSrc)); 4750 return; 4751 } 4752 4753 // Copies between GPR64 and FPR64. 4754 if (AArch64::FPR64RegClass.contains(DestReg) && 4755 AArch64::GPR64RegClass.contains(SrcReg)) { 4756 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) 4757 .addReg(SrcReg, getKillRegState(KillSrc)); 4758 return; 4759 } 4760 if (AArch64::GPR64RegClass.contains(DestReg) && 4761 AArch64::FPR64RegClass.contains(SrcReg)) { 4762 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg) 4763 .addReg(SrcReg, getKillRegState(KillSrc)); 4764 return; 4765 } 4766 // Copies between GPR32 and FPR32. 4767 if (AArch64::FPR32RegClass.contains(DestReg) && 4768 AArch64::GPR32RegClass.contains(SrcReg)) { 4769 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) 4770 .addReg(SrcReg, getKillRegState(KillSrc)); 4771 return; 4772 } 4773 if (AArch64::GPR32RegClass.contains(DestReg) && 4774 AArch64::FPR32RegClass.contains(SrcReg)) { 4775 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg) 4776 .addReg(SrcReg, getKillRegState(KillSrc)); 4777 return; 4778 } 4779 4780 if (DestReg == AArch64::NZCV) { 4781 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy"); 4782 BuildMI(MBB, I, DL, get(AArch64::MSR)) 4783 .addImm(AArch64SysReg::NZCV) 4784 .addReg(SrcReg, getKillRegState(KillSrc)) 4785 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define); 4786 return; 4787 } 4788 4789 if (SrcReg == AArch64::NZCV) { 4790 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy"); 4791 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg) 4792 .addImm(AArch64SysReg::NZCV) 4793 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); 4794 return; 4795 } 4796 4797 #ifndef NDEBUG 4798 const TargetRegisterInfo &TRI = getRegisterInfo(); 4799 errs() << TRI.getRegAsmName(DestReg) << " = COPY " 4800 << TRI.getRegAsmName(SrcReg) << "\n"; 4801 #endif 4802 llvm_unreachable("unimplemented reg-to-reg copy"); 4803 } 4804 4805 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, 4806 MachineBasicBlock &MBB, 4807 MachineBasicBlock::iterator InsertBefore, 4808 const MCInstrDesc &MCID, 4809 Register SrcReg, bool IsKill, 4810 unsigned SubIdx0, unsigned SubIdx1, int FI, 4811 MachineMemOperand *MMO) { 4812 Register SrcReg0 = SrcReg; 4813 Register SrcReg1 = SrcReg; 4814 if (SrcReg.isPhysical()) { 4815 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0); 4816 SubIdx0 = 0; 4817 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1); 4818 SubIdx1 = 0; 4819 } 4820 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 4821 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0) 4822 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1) 4823 .addFrameIndex(FI) 4824 .addImm(0) 4825 .addMemOperand(MMO); 4826 } 4827 4828 void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, 4829 
MachineBasicBlock::iterator MBBI, 4830 Register SrcReg, bool isKill, int FI, 4831 const TargetRegisterClass *RC, 4832 const TargetRegisterInfo *TRI, 4833 Register VReg) const { 4834 MachineFunction &MF = *MBB.getParent(); 4835 MachineFrameInfo &MFI = MF.getFrameInfo(); 4836 4837 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 4838 MachineMemOperand *MMO = 4839 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, 4840 MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); 4841 unsigned Opc = 0; 4842 bool Offset = true; 4843 MCRegister PNRReg = MCRegister::NoRegister; 4844 unsigned StackID = TargetStackID::Default; 4845 switch (TRI->getSpillSize(*RC)) { 4846 case 1: 4847 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 4848 Opc = AArch64::STRBui; 4849 break; 4850 case 2: { 4851 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 4852 Opc = AArch64::STRHui; 4853 else if (AArch64::PNRRegClass.hasSubClassEq(RC) || 4854 AArch64::PPRRegClass.hasSubClassEq(RC)) { 4855 assert(Subtarget.isSVEorStreamingSVEAvailable() && 4856 "Unexpected register store without SVE store instructions"); 4857 Opc = AArch64::STR_PXI; 4858 StackID = TargetStackID::ScalableVector; 4859 } 4860 break; 4861 } 4862 case 4: 4863 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 4864 Opc = AArch64::STRWui; 4865 if (SrcReg.isVirtual()) 4866 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass); 4867 else 4868 assert(SrcReg != AArch64::WSP); 4869 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 4870 Opc = AArch64::STRSui; 4871 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) { 4872 Opc = AArch64::STR_PPXI; 4873 StackID = TargetStackID::ScalableVector; 4874 } 4875 break; 4876 case 8: 4877 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 4878 Opc = AArch64::STRXui; 4879 if (SrcReg.isVirtual()) 4880 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 4881 else 4882 assert(SrcReg != AArch64::SP); 4883 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 4884 Opc = AArch64::STRDui; 4885 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 4886 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 4887 get(AArch64::STPWi), SrcReg, isKill, 4888 AArch64::sube32, AArch64::subo32, FI, MMO); 4889 return; 4890 } 4891 break; 4892 case 16: 4893 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 4894 Opc = AArch64::STRQui; 4895 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 4896 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 4897 Opc = AArch64::ST1Twov1d; 4898 Offset = false; 4899 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 4900 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 4901 get(AArch64::STPXi), SrcReg, isKill, 4902 AArch64::sube64, AArch64::subo64, FI, MMO); 4903 return; 4904 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 4905 assert(Subtarget.isSVEorStreamingSVEAvailable() && 4906 "Unexpected register store without SVE store instructions"); 4907 Opc = AArch64::STR_ZXI; 4908 StackID = TargetStackID::ScalableVector; 4909 } 4910 break; 4911 case 24: 4912 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 4913 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 4914 Opc = AArch64::ST1Threev1d; 4915 Offset = false; 4916 } 4917 break; 4918 case 32: 4919 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 4920 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 4921 Opc = AArch64::ST1Fourv1d; 4922 Offset = false; 4923 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 4924 
assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 4925 Opc = AArch64::ST1Twov2d; 4926 Offset = false; 4927 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) || 4928 AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { 4929 assert(Subtarget.isSVEorStreamingSVEAvailable() && 4930 "Unexpected register store without SVE store instructions"); 4931 Opc = AArch64::STR_ZZXI; 4932 StackID = TargetStackID::ScalableVector; 4933 } 4934 break; 4935 case 48: 4936 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 4937 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 4938 Opc = AArch64::ST1Threev2d; 4939 Offset = false; 4940 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { 4941 assert(Subtarget.isSVEorStreamingSVEAvailable() && 4942 "Unexpected register store without SVE store instructions"); 4943 Opc = AArch64::STR_ZZZXI; 4944 StackID = TargetStackID::ScalableVector; 4945 } 4946 break; 4947 case 64: 4948 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 4949 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 4950 Opc = AArch64::ST1Fourv2d; 4951 Offset = false; 4952 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) || 4953 AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { 4954 assert(Subtarget.isSVEorStreamingSVEAvailable() && 4955 "Unexpected register store without SVE store instructions"); 4956 Opc = AArch64::STR_ZZZZXI; 4957 StackID = TargetStackID::ScalableVector; 4958 } 4959 break; 4960 } 4961 assert(Opc && "Unknown register class"); 4962 MFI.setStackID(FI, StackID); 4963 4964 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 4965 .addReg(SrcReg, getKillRegState(isKill)) 4966 .addFrameIndex(FI); 4967 4968 if (Offset) 4969 MI.addImm(0); 4970 if (PNRReg.isValid()) 4971 MI.addDef(PNRReg, RegState::Implicit); 4972 MI.addMemOperand(MMO); 4973 } 4974 4975 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, 4976 MachineBasicBlock &MBB, 4977 MachineBasicBlock::iterator InsertBefore, 4978 const MCInstrDesc &MCID, 4979 Register DestReg, unsigned SubIdx0, 4980 unsigned SubIdx1, int FI, 4981 MachineMemOperand *MMO) { 4982 Register DestReg0 = DestReg; 4983 Register DestReg1 = DestReg; 4984 bool IsUndef = true; 4985 if (DestReg.isPhysical()) { 4986 DestReg0 = TRI.getSubReg(DestReg, SubIdx0); 4987 SubIdx0 = 0; 4988 DestReg1 = TRI.getSubReg(DestReg, SubIdx1); 4989 SubIdx1 = 0; 4990 IsUndef = false; 4991 } 4992 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 4993 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0) 4994 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1) 4995 .addFrameIndex(FI) 4996 .addImm(0) 4997 .addMemOperand(MMO); 4998 } 4999 5000 void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, 5001 MachineBasicBlock::iterator MBBI, 5002 Register DestReg, int FI, 5003 const TargetRegisterClass *RC, 5004 const TargetRegisterInfo *TRI, 5005 Register VReg) const { 5006 MachineFunction &MF = *MBB.getParent(); 5007 MachineFrameInfo &MFI = MF.getFrameInfo(); 5008 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 5009 MachineMemOperand *MMO = 5010 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, 5011 MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); 5012 5013 unsigned Opc = 0; 5014 bool Offset = true; 5015 unsigned StackID = TargetStackID::Default; 5016 Register PNRReg = MCRegister::NoRegister; 5017 switch (TRI->getSpillSize(*RC)) { 5018 case 1: 5019 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 5020 Opc = 
AArch64::LDRBui; 5021 break; 5022 case 2: { 5023 bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC); 5024 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 5025 Opc = AArch64::LDRHui; 5026 else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) { 5027 assert(Subtarget.isSVEorStreamingSVEAvailable() && 5028 "Unexpected register load without SVE load instructions"); 5029 if (IsPNR) 5030 PNRReg = DestReg; 5031 Opc = AArch64::LDR_PXI; 5032 StackID = TargetStackID::ScalableVector; 5033 } 5034 break; 5035 } 5036 case 4: 5037 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 5038 Opc = AArch64::LDRWui; 5039 if (DestReg.isVirtual()) 5040 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass); 5041 else 5042 assert(DestReg != AArch64::WSP); 5043 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 5044 Opc = AArch64::LDRSui; 5045 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) { 5046 Opc = AArch64::LDR_PPXI; 5047 StackID = TargetStackID::ScalableVector; 5048 } 5049 break; 5050 case 8: 5051 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 5052 Opc = AArch64::LDRXui; 5053 if (DestReg.isVirtual()) 5054 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass); 5055 else 5056 assert(DestReg != AArch64::SP); 5057 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 5058 Opc = AArch64::LDRDui; 5059 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 5060 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 5061 get(AArch64::LDPWi), DestReg, AArch64::sube32, 5062 AArch64::subo32, FI, MMO); 5063 return; 5064 } 5065 break; 5066 case 16: 5067 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 5068 Opc = AArch64::LDRQui; 5069 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 5070 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 5071 Opc = AArch64::LD1Twov1d; 5072 Offset = false; 5073 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 5074 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 5075 get(AArch64::LDPXi), DestReg, AArch64::sube64, 5076 AArch64::subo64, FI, MMO); 5077 return; 5078 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 5079 assert(Subtarget.isSVEorStreamingSVEAvailable() && 5080 "Unexpected register load without SVE load instructions"); 5081 Opc = AArch64::LDR_ZXI; 5082 StackID = TargetStackID::ScalableVector; 5083 } 5084 break; 5085 case 24: 5086 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 5087 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 5088 Opc = AArch64::LD1Threev1d; 5089 Offset = false; 5090 } 5091 break; 5092 case 32: 5093 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 5094 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 5095 Opc = AArch64::LD1Fourv1d; 5096 Offset = false; 5097 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 5098 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 5099 Opc = AArch64::LD1Twov2d; 5100 Offset = false; 5101 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) || 5102 AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { 5103 assert(Subtarget.isSVEorStreamingSVEAvailable() && 5104 "Unexpected register load without SVE load instructions"); 5105 Opc = AArch64::LDR_ZZXI; 5106 StackID = TargetStackID::ScalableVector; 5107 } 5108 break; 5109 case 48: 5110 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 5111 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 5112 Opc = AArch64::LD1Threev2d; 5113 Offset = false; 5114 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { 5115 
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZZZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  case 64:
    if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
      Opc = AArch64::LD1Fourv2d;
      Offset = false;
    } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
               AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
             "Unexpected register load without SVE load instructions");
      Opc = AArch64::LDR_ZZZZXI;
      StackID = TargetStackID::ScalableVector;
    }
    break;
  }

  assert(Opc && "Unknown register class");
  MFI.setStackID(FI, StackID);

  const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
                                     .addReg(DestReg, getDefRegState(true))
                                     .addFrameIndex(FI);
  if (Offset)
    MI.addImm(0);
  if (PNRReg.isValid() && !PNRReg.isVirtual())
    MI.addDef(PNRReg, RegState::Implicit);
  MI.addMemOperand(MMO);
}

bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
                                           const MachineInstr &UseMI,
                                           const TargetRegisterInfo *TRI) {
  return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
                                         UseMI.getIterator()),
                [TRI](const MachineInstr &I) {
                  return I.modifiesRegister(AArch64::NZCV, TRI) ||
                         I.readsRegister(AArch64::NZCV, TRI);
                });
}

void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
    const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
  // The smallest scalable element supported by scaled SVE addressing modes is
  // the predicate, which is 2 scalable bytes in size. So the scalable byte
  // offset must always be a multiple of 2.
  assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");

  // VGSized offsets are divided by '2', because the VG register is the number
  // of 64bit granules as opposed to 128bit vector chunks, which is how the
  // 'n' in e.g. MVT::nxv1i8 is modelled.
  // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
  // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
  ByteSized = Offset.getFixed();
  VGSized = Offset.getScalable() / 2;
}

/// Returns the offset in parts to which this frame offset can be
/// decomposed for the purpose of describing a frame offset.
/// For non-scalable offsets this is simply its byte size.
void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
    const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
    int64_t &NumDataVectors) {
  // The smallest scalable element supported by scaled SVE addressing modes is
  // the predicate, which is 2 scalable bytes in size. So the scalable byte
  // offset must always be a multiple of 2.
  assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");

  NumBytes = Offset.getFixed();
  NumDataVectors = 0;
  NumPredicateVectors = Offset.getScalable() / 2;
  // This method is used to get the offsets to adjust the frame offset.
  // If the function requires ADDPL to be used and needs more than two ADDPL
  // instructions, part of the offset is folded into NumDataVectors so that it
  // uses ADDVL for part of it, reducing the number of ADDPL instructions.
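  // For example, a scalable offset of 160 bytes corresponds to 80 predicate
  // vectors; 80 is a multiple of 8, so it is emitted as 10 data vectors
  // (ADDVL) and no remaining predicate vectors.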
5194 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 || 5195 NumPredicateVectors > 62) { 5196 NumDataVectors = NumPredicateVectors / 8; 5197 NumPredicateVectors -= NumDataVectors * 8; 5198 } 5199 } 5200 5201 // Convenience function to create a DWARF expression for 5202 // Expr + NumBytes + NumVGScaledBytes * AArch64::VG 5203 static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes, 5204 int NumVGScaledBytes, unsigned VG, 5205 llvm::raw_string_ostream &Comment) { 5206 uint8_t buffer[16]; 5207 5208 if (NumBytes) { 5209 Expr.push_back(dwarf::DW_OP_consts); 5210 Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer)); 5211 Expr.push_back((uint8_t)dwarf::DW_OP_plus); 5212 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes); 5213 } 5214 5215 if (NumVGScaledBytes) { 5216 Expr.push_back((uint8_t)dwarf::DW_OP_consts); 5217 Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer)); 5218 5219 Expr.push_back((uint8_t)dwarf::DW_OP_bregx); 5220 Expr.append(buffer, buffer + encodeULEB128(VG, buffer)); 5221 Expr.push_back(0); 5222 5223 Expr.push_back((uint8_t)dwarf::DW_OP_mul); 5224 Expr.push_back((uint8_t)dwarf::DW_OP_plus); 5225 5226 Comment << (NumVGScaledBytes < 0 ? " - " : " + ") 5227 << std::abs(NumVGScaledBytes) << " * VG"; 5228 } 5229 } 5230 5231 // Creates an MCCFIInstruction: 5232 // { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr } 5233 static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, 5234 unsigned Reg, 5235 const StackOffset &Offset) { 5236 int64_t NumBytes, NumVGScaledBytes; 5237 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes, 5238 NumVGScaledBytes); 5239 std::string CommentBuffer; 5240 llvm::raw_string_ostream Comment(CommentBuffer); 5241 5242 if (Reg == AArch64::SP) 5243 Comment << "sp"; 5244 else if (Reg == AArch64::FP) 5245 Comment << "fp"; 5246 else 5247 Comment << printReg(Reg, &TRI); 5248 5249 // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG) 5250 SmallString<64> Expr; 5251 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); 5252 Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg)); 5253 Expr.push_back(0); 5254 appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes, 5255 TRI.getDwarfRegNum(AArch64::VG, true), Comment); 5256 5257 // Wrap this into DW_CFA_def_cfa. 
5258 SmallString<64> DefCfaExpr; 5259 DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression); 5260 uint8_t buffer[16]; 5261 DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer)); 5262 DefCfaExpr.append(Expr.str()); 5263 return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(), 5264 Comment.str()); 5265 } 5266 5267 MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI, 5268 unsigned FrameReg, unsigned Reg, 5269 const StackOffset &Offset, 5270 bool LastAdjustmentWasScalable) { 5271 if (Offset.getScalable()) 5272 return createDefCFAExpression(TRI, Reg, Offset); 5273 5274 if (FrameReg == Reg && !LastAdjustmentWasScalable) 5275 return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed())); 5276 5277 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); 5278 return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed()); 5279 } 5280 5281 MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI, 5282 unsigned Reg, 5283 const StackOffset &OffsetFromDefCFA) { 5284 int64_t NumBytes, NumVGScaledBytes; 5285 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets( 5286 OffsetFromDefCFA, NumBytes, NumVGScaledBytes); 5287 5288 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); 5289 5290 // Non-scalable offsets can use DW_CFA_offset directly. 5291 if (!NumVGScaledBytes) 5292 return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes); 5293 5294 std::string CommentBuffer; 5295 llvm::raw_string_ostream Comment(CommentBuffer); 5296 Comment << printReg(Reg, &TRI) << " @ cfa"; 5297 5298 // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG) 5299 SmallString<64> OffsetExpr; 5300 appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes, 5301 TRI.getDwarfRegNum(AArch64::VG, true), Comment); 5302 5303 // Wrap this into DW_CFA_expression 5304 SmallString<64> CfaExpr; 5305 CfaExpr.push_back(dwarf::DW_CFA_expression); 5306 uint8_t buffer[16]; 5307 CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer)); 5308 CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer)); 5309 CfaExpr.append(OffsetExpr.str()); 5310 5311 return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(), 5312 Comment.str()); 5313 } 5314 5315 // Helper function to emit a frame offset adjustment from a given 5316 // pointer (SrcReg), stored into DestReg. This function is explicit 5317 // in that it requires the opcode. 5318 static void emitFrameOffsetAdj(MachineBasicBlock &MBB, 5319 MachineBasicBlock::iterator MBBI, 5320 const DebugLoc &DL, unsigned DestReg, 5321 unsigned SrcReg, int64_t Offset, unsigned Opc, 5322 const TargetInstrInfo *TII, 5323 MachineInstr::MIFlag Flag, bool NeedsWinCFI, 5324 bool *HasWinCFI, bool EmitCFAOffset, 5325 StackOffset CFAOffset, unsigned FrameReg) { 5326 int Sign = 1; 5327 unsigned MaxEncoding, ShiftSize; 5328 switch (Opc) { 5329 case AArch64::ADDXri: 5330 case AArch64::ADDSXri: 5331 case AArch64::SUBXri: 5332 case AArch64::SUBSXri: 5333 MaxEncoding = 0xfff; 5334 ShiftSize = 12; 5335 break; 5336 case AArch64::ADDVL_XXI: 5337 case AArch64::ADDPL_XXI: 5338 case AArch64::ADDSVL_XXI: 5339 case AArch64::ADDSPL_XXI: 5340 MaxEncoding = 31; 5341 ShiftSize = 0; 5342 if (Offset < 0) { 5343 MaxEncoding = 32; 5344 Sign = -1; 5345 Offset = -Offset; 5346 } 5347 break; 5348 default: 5349 llvm_unreachable("Unsupported opcode"); 5350 } 5351 5352 // `Offset` can be in bytes or in "scalable bytes". 
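  // VScale converts the instruction's immediate units back into scalable
  // bytes for the CFA bookkeeping below: ADDVL/ADDSVL immediates count whole
  // SVE vectors (16 scalable bytes each), while ADDPL/ADDSPL immediates count
  // predicates (2 scalable bytes each).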
5353 int VScale = 1; 5354 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI) 5355 VScale = 16; 5356 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI) 5357 VScale = 2; 5358 5359 // FIXME: If the offset won't fit in 24-bits, compute the offset into a 5360 // scratch register. If DestReg is a virtual register, use it as the 5361 // scratch register; otherwise, create a new virtual register (to be 5362 // replaced by the scavenger at the end of PEI). That case can be optimized 5363 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch 5364 // register can be loaded with offset%8 and the add/sub can use an extending 5365 // instruction with LSL#3. 5366 // Currently the function handles any offsets but generates a poor sequence 5367 // of code. 5368 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); 5369 5370 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; 5371 Register TmpReg = DestReg; 5372 if (TmpReg == AArch64::XZR) 5373 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister( 5374 &AArch64::GPR64RegClass); 5375 do { 5376 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue); 5377 unsigned LocalShiftSize = 0; 5378 if (ThisVal > MaxEncoding) { 5379 ThisVal = ThisVal >> ShiftSize; 5380 LocalShiftSize = ShiftSize; 5381 } 5382 assert((ThisVal >> ShiftSize) <= MaxEncoding && 5383 "Encoding cannot handle value that big"); 5384 5385 Offset -= ThisVal << LocalShiftSize; 5386 if (Offset == 0) 5387 TmpReg = DestReg; 5388 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg) 5389 .addReg(SrcReg) 5390 .addImm(Sign * (int)ThisVal); 5391 if (ShiftSize) 5392 MBI = MBI.addImm( 5393 AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize)); 5394 MBI = MBI.setMIFlag(Flag); 5395 5396 auto Change = 5397 VScale == 1 5398 ? 
            StackOffset::getFixed(ThisVal << LocalShiftSize)
            : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize));
    if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
      CFAOffset += Change;
    else
      CFAOffset -= Change;
    if (EmitCFAOffset && DestReg == TmpReg) {
      MachineFunction &MF = *MBB.getParent();
      const TargetSubtargetInfo &STI = MF.getSubtarget();
      const TargetRegisterInfo &TRI = *STI.getRegisterInfo();

      unsigned CFIIndex = MF.addFrameInst(
          createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1));
      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
          .addCFIIndex(CFIIndex)
          .setMIFlags(Flag);
    }

    if (NeedsWinCFI) {
      assert(Sign == 1 && "SEH directives should always have a positive sign");
      int Imm = (int)(ThisVal << LocalShiftSize);
      if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
          (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
        if (HasWinCFI)
          *HasWinCFI = true;
        if (Imm == 0)
          BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
        else
          BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
              .addImm(Imm)
              .setMIFlag(Flag);
        assert(Offset == 0 && "Expected remaining offset to be zero to "
                              "emit a single SEH directive");
      } else if (DestReg == AArch64::SP) {
        if (HasWinCFI)
          *HasWinCFI = true;
        assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
            .addImm(Imm)
            .setMIFlag(Flag);
      }
    }

    SrcReg = TmpReg;
  } while (Offset);
}

void llvm::emitFrameOffset(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
                           unsigned DestReg, unsigned SrcReg,
                           StackOffset Offset, const TargetInstrInfo *TII,
                           MachineInstr::MIFlag Flag, bool SetNZCV,
                           bool NeedsWinCFI, bool *HasWinCFI,
                           bool EmitCFAOffset, StackOffset CFAOffset,
                           unsigned FrameReg) {
  // If a function is marked as arm_locally_streaming, then the runtime value
  // of vscale in the prologue/epilogue is different from the runtime value of
  // vscale in the function's body. To avoid having to consider multiple
  // vscales, we can use `addsvl` to allocate any scalable stack-slots, which
  // under most circumstances will be only locals, not callee-save slots.
  const Function &F = MBB.getParent()->getFunction();
  bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body");

  int64_t Bytes, NumPredicateVectors, NumDataVectors;
  AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
      Offset, Bytes, NumPredicateVectors, NumDataVectors);

  // First emit non-scalable frame offsets, or a simple 'mov'.
  if (Bytes || (!Offset && SrcReg != DestReg)) {
    assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
           "SP increment/decrement not 8-byte aligned");
    unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
    if (Bytes < 0) {
      Bytes = -Bytes;
      Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
    }
    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag,
                       NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
                       FrameReg);
    CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
                     ? StackOffset::getFixed(-Bytes)
                     : StackOffset::getFixed(Bytes);
    SrcReg = DestReg;
    FrameReg = DestReg;
  }

  assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
         "SetNZCV not supported with SVE vectors");
  assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
         "WinCFI not supported with SVE vectors");

  if (NumDataVectors) {
    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
                       UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI,
                       TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
                       CFAOffset, FrameReg);
    CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
    SrcReg = DestReg;
  }

  if (NumPredicateVectors) {
    assert(DestReg != AArch64::SP && "Unaligned access to SP");
    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
                       UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI,
                       TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
                       CFAOffset, FrameReg);
  }
}

MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
    MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
    MachineBasicBlock::iterator InsertPt, int FrameIndex,
    LiveIntervals *LIS, VirtRegMap *VRM) const {
  // This is a bit of a hack. Consider this instruction:
  //
  //   %0 = COPY %sp; GPR64all:%0
  //
  // We explicitly chose GPR64all for the virtual register so such a copy might
  // be eliminated by RegisterCoalescer. However, that may not be possible, and
  // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
  // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
  //
  // To prevent that, we are going to constrain the %0 register class here.
  if (MI.isFullCopy()) {
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();
    if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
      MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
      return nullptr;
    }
    if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
      MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
      return nullptr;
    }
    // Nothing can be folded with a copy from/to NZCV.
    if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
      return nullptr;
  }

  // Handle the case where a copy is being spilled or filled but the source
  // and destination register classes don't match. For example:
  //
  //   %0 = COPY %xzr; GPR64common:%0
  //
  // In this case we can still safely fold away the COPY and generate the
  // following spill code:
  //
  //   STRXui %xzr, %stack.0
  //
  // This also eliminates spilled cross register class COPYs (e.g. between x and
  // d regs) of the same size. For example:
  //
  //   %0 = COPY %1; GPR64:%0, FPR64:%1
  //
  // will be filled as
  //
  //   LDRDui %0, fi<#0>
  //
  // instead of
  //
  //   LDRXui %Temp, fi<#0>
  //   %0 = FMOV %Temp
  //
  if (MI.isCopy() && Ops.size() == 1 &&
      // Make sure we're only folding the explicit COPY defs/uses.
5563 (Ops[0] == 0 || Ops[0] == 1)) { 5564 bool IsSpill = Ops[0] == 0; 5565 bool IsFill = !IsSpill; 5566 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); 5567 const MachineRegisterInfo &MRI = MF.getRegInfo(); 5568 MachineBasicBlock &MBB = *MI.getParent(); 5569 const MachineOperand &DstMO = MI.getOperand(0); 5570 const MachineOperand &SrcMO = MI.getOperand(1); 5571 Register DstReg = DstMO.getReg(); 5572 Register SrcReg = SrcMO.getReg(); 5573 // This is slightly expensive to compute for physical regs since 5574 // getMinimalPhysRegClass is slow. 5575 auto getRegClass = [&](unsigned Reg) { 5576 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg) 5577 : TRI.getMinimalPhysRegClass(Reg); 5578 }; 5579 5580 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { 5581 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) == 5582 TRI.getRegSizeInBits(*getRegClass(SrcReg)) && 5583 "Mismatched register size in non subreg COPY"); 5584 if (IsSpill) 5585 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, 5586 getRegClass(SrcReg), &TRI, Register()); 5587 else 5588 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, 5589 getRegClass(DstReg), &TRI, Register()); 5590 return &*--InsertPt; 5591 } 5592 5593 // Handle cases like spilling def of: 5594 // 5595 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0 5596 // 5597 // where the physical register source can be widened and stored to the full 5598 // virtual reg destination stack slot, in this case producing: 5599 // 5600 // STRXui %xzr, %stack.0 5601 // 5602 if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR && 5603 TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) { 5604 assert(SrcMO.getSubReg() == 0 && 5605 "Unexpected subreg on physical register"); 5606 storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(), 5607 FrameIndex, &AArch64::GPR64RegClass, &TRI, 5608 Register()); 5609 return &*--InsertPt; 5610 } 5611 5612 // Handle cases like filling use of: 5613 // 5614 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1 5615 // 5616 // where we can load the full virtual reg source stack slot, into the subreg 5617 // destination, in this case producing: 5618 // 5619 // LDRWui %0:sub_32<def,read-undef>, %stack.0 5620 // 5621 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) { 5622 const TargetRegisterClass *FillRC; 5623 switch (DstMO.getSubReg()) { 5624 default: 5625 FillRC = nullptr; 5626 break; 5627 case AArch64::sub_32: 5628 FillRC = &AArch64::GPR32RegClass; 5629 break; 5630 case AArch64::ssub: 5631 FillRC = &AArch64::FPR32RegClass; 5632 break; 5633 case AArch64::dsub: 5634 FillRC = &AArch64::FPR64RegClass; 5635 break; 5636 } 5637 5638 if (FillRC) { 5639 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) == 5640 TRI.getRegSizeInBits(*FillRC) && 5641 "Mismatched regclass size on folded subreg COPY"); 5642 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI, 5643 Register()); 5644 MachineInstr &LoadMI = *--InsertPt; 5645 MachineOperand &LoadDst = LoadMI.getOperand(0); 5646 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load"); 5647 LoadDst.setSubReg(DstMO.getSubReg()); 5648 LoadDst.setIsUndef(); 5649 return &LoadMI; 5650 } 5651 } 5652 } 5653 5654 // Cannot fold. 5655 return nullptr; 5656 } 5657 5658 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, 5659 StackOffset &SOffset, 5660 bool *OutUseUnscaledOp, 5661 unsigned *OutUnscaledOp, 5662 int64_t *EmittableOffset) { 5663 // Set output values in case of early exit. 
5664 if (EmittableOffset) 5665 *EmittableOffset = 0; 5666 if (OutUseUnscaledOp) 5667 *OutUseUnscaledOp = false; 5668 if (OutUnscaledOp) 5669 *OutUnscaledOp = 0; 5670 5671 // Exit early for structured vector spills/fills as they can't take an 5672 // immediate offset. 5673 switch (MI.getOpcode()) { 5674 default: 5675 break; 5676 case AArch64::LD1Rv1d: 5677 case AArch64::LD1Rv2s: 5678 case AArch64::LD1Rv2d: 5679 case AArch64::LD1Rv4h: 5680 case AArch64::LD1Rv4s: 5681 case AArch64::LD1Rv8b: 5682 case AArch64::LD1Rv8h: 5683 case AArch64::LD1Rv16b: 5684 case AArch64::LD1Twov2d: 5685 case AArch64::LD1Threev2d: 5686 case AArch64::LD1Fourv2d: 5687 case AArch64::LD1Twov1d: 5688 case AArch64::LD1Threev1d: 5689 case AArch64::LD1Fourv1d: 5690 case AArch64::ST1Twov2d: 5691 case AArch64::ST1Threev2d: 5692 case AArch64::ST1Fourv2d: 5693 case AArch64::ST1Twov1d: 5694 case AArch64::ST1Threev1d: 5695 case AArch64::ST1Fourv1d: 5696 case AArch64::ST1i8: 5697 case AArch64::ST1i16: 5698 case AArch64::ST1i32: 5699 case AArch64::ST1i64: 5700 case AArch64::IRG: 5701 case AArch64::IRGstack: 5702 case AArch64::STGloop: 5703 case AArch64::STZGloop: 5704 return AArch64FrameOffsetCannotUpdate; 5705 } 5706 5707 // Get the min/max offset and the scale. 5708 TypeSize ScaleValue(0U, false), Width(0U, false); 5709 int64_t MinOff, MaxOff; 5710 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff, 5711 MaxOff)) 5712 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 5713 5714 // Construct the complete offset. 5715 bool IsMulVL = ScaleValue.isScalable(); 5716 unsigned Scale = ScaleValue.getKnownMinValue(); 5717 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed(); 5718 5719 const MachineOperand &ImmOpnd = 5720 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode())); 5721 Offset += ImmOpnd.getImm() * Scale; 5722 5723 // If the offset doesn't match the scale, we rewrite the instruction to 5724 // use the unscaled instruction instead. Likewise, if we have a negative 5725 // offset and there is an unscaled op to use. 5726 std::optional<unsigned> UnscaledOp = 5727 AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode()); 5728 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0); 5729 if (useUnscaledOp && 5730 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff, 5731 MaxOff)) 5732 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 5733 5734 Scale = ScaleValue.getKnownMinValue(); 5735 assert(IsMulVL == ScaleValue.isScalable() && 5736 "Unscaled opcode has different value for scalable"); 5737 5738 int64_t Remainder = Offset % Scale; 5739 assert(!(Remainder && useUnscaledOp) && 5740 "Cannot have remainder when using unscaled op"); 5741 5742 assert(MinOff < MaxOff && "Unexpected Min/Max offsets"); 5743 int64_t NewOffset = Offset / Scale; 5744 if (MinOff <= NewOffset && NewOffset <= MaxOff) 5745 Offset = Remainder; 5746 else { 5747 NewOffset = NewOffset < 0 ? MinOff : MaxOff; 5748 Offset = Offset - (NewOffset * Scale); 5749 } 5750 5751 if (EmittableOffset) 5752 *EmittableOffset = NewOffset; 5753 if (OutUseUnscaledOp) 5754 *OutUseUnscaledOp = useUnscaledOp; 5755 if (OutUnscaledOp && UnscaledOp) 5756 *OutUnscaledOp = *UnscaledOp; 5757 5758 if (IsMulVL) 5759 SOffset = StackOffset::get(SOffset.getFixed(), Offset); 5760 else 5761 SOffset = StackOffset::get(Offset, SOffset.getScalable()); 5762 return AArch64FrameOffsetCanUpdate | 5763 (SOffset ? 
0 : AArch64FrameOffsetIsLegal); 5764 } 5765 5766 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, 5767 unsigned FrameReg, StackOffset &Offset, 5768 const AArch64InstrInfo *TII) { 5769 unsigned Opcode = MI.getOpcode(); 5770 unsigned ImmIdx = FrameRegIdx + 1; 5771 5772 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) { 5773 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm()); 5774 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), 5775 MI.getOperand(0).getReg(), FrameReg, Offset, TII, 5776 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri)); 5777 MI.eraseFromParent(); 5778 Offset = StackOffset(); 5779 return true; 5780 } 5781 5782 int64_t NewOffset; 5783 unsigned UnscaledOp; 5784 bool UseUnscaledOp; 5785 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, 5786 &UnscaledOp, &NewOffset); 5787 if (Status & AArch64FrameOffsetCanUpdate) { 5788 if (Status & AArch64FrameOffsetIsLegal) 5789 // Replace the FrameIndex with FrameReg. 5790 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); 5791 if (UseUnscaledOp) 5792 MI.setDesc(TII->get(UnscaledOp)); 5793 5794 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); 5795 return !Offset; 5796 } 5797 5798 return false; 5799 } 5800 5801 void AArch64InstrInfo::insertNoop(MachineBasicBlock &MBB, 5802 MachineBasicBlock::iterator MI) const { 5803 DebugLoc DL; 5804 BuildMI(MBB, MI, DL, get(AArch64::HINT)).addImm(0); 5805 } 5806 5807 MCInst AArch64InstrInfo::getNop() const { 5808 return MCInstBuilder(AArch64::HINT).addImm(0); 5809 } 5810 5811 // AArch64 supports MachineCombiner. 5812 bool AArch64InstrInfo::useMachineCombiner() const { return true; } 5813 5814 // True when Opc sets flag 5815 static bool isCombineInstrSettingFlag(unsigned Opc) { 5816 switch (Opc) { 5817 case AArch64::ADDSWrr: 5818 case AArch64::ADDSWri: 5819 case AArch64::ADDSXrr: 5820 case AArch64::ADDSXri: 5821 case AArch64::SUBSWrr: 5822 case AArch64::SUBSXrr: 5823 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 5824 case AArch64::SUBSWri: 5825 case AArch64::SUBSXri: 5826 return true; 5827 default: 5828 break; 5829 } 5830 return false; 5831 } 5832 5833 // 32b Opcodes that can be combined with a MUL 5834 static bool isCombineInstrCandidate32(unsigned Opc) { 5835 switch (Opc) { 5836 case AArch64::ADDWrr: 5837 case AArch64::ADDWri: 5838 case AArch64::SUBWrr: 5839 case AArch64::ADDSWrr: 5840 case AArch64::ADDSWri: 5841 case AArch64::SUBSWrr: 5842 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 5843 case AArch64::SUBWri: 5844 case AArch64::SUBSWri: 5845 return true; 5846 default: 5847 break; 5848 } 5849 return false; 5850 } 5851 5852 // 64b Opcodes that can be combined with a MUL 5853 static bool isCombineInstrCandidate64(unsigned Opc) { 5854 switch (Opc) { 5855 case AArch64::ADDXrr: 5856 case AArch64::ADDXri: 5857 case AArch64::SUBXrr: 5858 case AArch64::ADDSXrr: 5859 case AArch64::ADDSXri: 5860 case AArch64::SUBSXrr: 5861 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 
  case AArch64::SUBXri:
  case AArch64::SUBSXri:
  case AArch64::ADDv8i8:
  case AArch64::ADDv16i8:
  case AArch64::ADDv4i16:
  case AArch64::ADDv8i16:
  case AArch64::ADDv2i32:
  case AArch64::ADDv4i32:
  case AArch64::SUBv8i8:
  case AArch64::SUBv16i8:
  case AArch64::SUBv4i16:
  case AArch64::SUBv8i16:
  case AArch64::SUBv2i32:
  case AArch64::SUBv4i32:
    return true;
  default:
    break;
  }
  return false;
}

// FP Opcodes that can be combined with a FMUL.
static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
  switch (Inst.getOpcode()) {
  default:
    break;
  case AArch64::FADDHrr:
  case AArch64::FADDSrr:
  case AArch64::FADDDrr:
  case AArch64::FADDv4f16:
  case AArch64::FADDv8f16:
  case AArch64::FADDv2f32:
  case AArch64::FADDv2f64:
  case AArch64::FADDv4f32:
  case AArch64::FSUBHrr:
  case AArch64::FSUBSrr:
  case AArch64::FSUBDrr:
  case AArch64::FSUBv4f16:
  case AArch64::FSUBv8f16:
  case AArch64::FSUBv2f32:
  case AArch64::FSUBv2f64:
  case AArch64::FSUBv4f32:
    TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
    // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
    // the target options or if FADD/FSUB has the contract fast-math flag.
    return Options.UnsafeFPMath ||
           Options.AllowFPOpFusion == FPOpFusion::Fast ||
           Inst.getFlag(MachineInstr::FmContract);
    return true;
  }
  return false;
}

// Opcodes that can be combined with a MUL
static bool isCombineInstrCandidate(unsigned Opc) {
  return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
}

//
// Utility routine that checks if \param MO is defined by an
// \param CombineOpc instruction in the basic block \param MBB
static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
                       unsigned CombineOpc, unsigned ZeroReg = 0,
                       bool CheckZeroReg = false) {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineInstr *MI = nullptr;

  if (MO.isReg() && MO.getReg().isVirtual())
    MI = MRI.getUniqueVRegDef(MO.getReg());
  // And it needs to be in the trace (otherwise, it won't have a depth).
  if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
    return false;
  // Must only be used by the user we combine with.
  if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
    return false;

  if (CheckZeroReg) {
    assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
           MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
           MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
    // The third input reg must be zero.
    if (MI->getOperand(3).getReg() != ZeroReg)
      return false;
  }

  if (isCombineInstrSettingFlag(CombineOpc) &&
      MI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) == -1)
    return false;

  return true;
}

//
// Is \param MO defined by an integer multiply and can be combined?
static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
                              unsigned MulOpc, unsigned ZeroReg) {
  return canCombine(MBB, MO, MulOpc, ZeroReg, true);
}

//
// Is \param MO defined by a floating-point multiply and can be combined?
static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
                               unsigned MulOpc) {
  return canCombine(MBB, MO, MulOpc);
}

// TODO: There are many more machine instruction opcodes to match:
// 1. Other data types (integer, vectors)
// 2. Other math / logic operations (xor, or)
// 3. Other forms of the same operation (intrinsics and other variants)
bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
                                                   bool Invert) const {
  if (Invert)
    return false;
  switch (Inst.getOpcode()) {
  // == Floating-point types ==
  // -- Floating-point instructions --
  case AArch64::FADDHrr:
  case AArch64::FADDSrr:
  case AArch64::FADDDrr:
  case AArch64::FMULHrr:
  case AArch64::FMULSrr:
  case AArch64::FMULDrr:
  case AArch64::FMULX16:
  case AArch64::FMULX32:
  case AArch64::FMULX64:
  // -- Advanced SIMD instructions --
  case AArch64::FADDv4f16:
  case AArch64::FADDv8f16:
  case AArch64::FADDv2f32:
  case AArch64::FADDv4f32:
  case AArch64::FADDv2f64:
  case AArch64::FMULv4f16:
  case AArch64::FMULv8f16:
  case AArch64::FMULv2f32:
  case AArch64::FMULv4f32:
  case AArch64::FMULv2f64:
  case AArch64::FMULXv4f16:
  case AArch64::FMULXv8f16:
  case AArch64::FMULXv2f32:
  case AArch64::FMULXv4f32:
  case AArch64::FMULXv2f64:
  // -- SVE instructions --
  // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
  // in the SVE instruction set (though there are predicated ones).
  case AArch64::FADD_ZZZ_H:
  case AArch64::FADD_ZZZ_S:
  case AArch64::FADD_ZZZ_D:
  case AArch64::FMUL_ZZZ_H:
  case AArch64::FMUL_ZZZ_S:
  case AArch64::FMUL_ZZZ_D:
    return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath ||
           (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
            Inst.getFlag(MachineInstr::MIFlag::FmNsz));

  // == Integer types ==
  // -- Base instructions --
  // Opcodes MULWrr and MULXrr don't exist because
  // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
  // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
  // The machine-combiner does not support three-source-operand machine
  // instructions, so we cannot reassociate MULs.
  case AArch64::ADDWrr:
  case AArch64::ADDXrr:
  case AArch64::ANDWrr:
  case AArch64::ANDXrr:
  case AArch64::ORRWrr:
  case AArch64::ORRXrr:
  case AArch64::EORWrr:
  case AArch64::EORXrr:
  case AArch64::EONWrr:
  case AArch64::EONXrr:
  // -- Advanced SIMD instructions --
  // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
  // in the Advanced SIMD instruction set.
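  // The integer adds/multiplies and bitwise ops below are exact, so unlike
  // the FP cases above they can be reassociated without any fast-math flags.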
6037 case AArch64::ADDv8i8: 6038 case AArch64::ADDv16i8: 6039 case AArch64::ADDv4i16: 6040 case AArch64::ADDv8i16: 6041 case AArch64::ADDv2i32: 6042 case AArch64::ADDv4i32: 6043 case AArch64::ADDv1i64: 6044 case AArch64::ADDv2i64: 6045 case AArch64::MULv8i8: 6046 case AArch64::MULv16i8: 6047 case AArch64::MULv4i16: 6048 case AArch64::MULv8i16: 6049 case AArch64::MULv2i32: 6050 case AArch64::MULv4i32: 6051 case AArch64::ANDv8i8: 6052 case AArch64::ANDv16i8: 6053 case AArch64::ORRv8i8: 6054 case AArch64::ORRv16i8: 6055 case AArch64::EORv8i8: 6056 case AArch64::EORv16i8: 6057 // -- SVE instructions -- 6058 case AArch64::ADD_ZZZ_B: 6059 case AArch64::ADD_ZZZ_H: 6060 case AArch64::ADD_ZZZ_S: 6061 case AArch64::ADD_ZZZ_D: 6062 case AArch64::MUL_ZZZ_B: 6063 case AArch64::MUL_ZZZ_H: 6064 case AArch64::MUL_ZZZ_S: 6065 case AArch64::MUL_ZZZ_D: 6066 case AArch64::AND_ZZZ: 6067 case AArch64::ORR_ZZZ: 6068 case AArch64::EOR_ZZZ: 6069 return true; 6070 6071 default: 6072 return false; 6073 } 6074 } 6075 6076 /// Find instructions that can be turned into madd. 6077 static bool getMaddPatterns(MachineInstr &Root, 6078 SmallVectorImpl<unsigned> &Patterns) { 6079 unsigned Opc = Root.getOpcode(); 6080 MachineBasicBlock &MBB = *Root.getParent(); 6081 bool Found = false; 6082 6083 if (!isCombineInstrCandidate(Opc)) 6084 return false; 6085 if (isCombineInstrSettingFlag(Opc)) { 6086 int Cmp_NZCV = 6087 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true); 6088 // When NZCV is live bail out. 6089 if (Cmp_NZCV == -1) 6090 return false; 6091 unsigned NewOpc = convertToNonFlagSettingOpc(Root); 6092 // When opcode can't change bail out. 6093 // CHECKME: do we miss any cases for opcode conversion? 6094 if (NewOpc == Opc) 6095 return false; 6096 Opc = NewOpc; 6097 } 6098 6099 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg, 6100 unsigned Pattern) { 6101 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) { 6102 Patterns.push_back(Pattern); 6103 Found = true; 6104 } 6105 }; 6106 6107 auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) { 6108 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) { 6109 Patterns.push_back(Pattern); 6110 Found = true; 6111 } 6112 }; 6113 6114 typedef AArch64MachineCombinerPattern MCP; 6115 6116 switch (Opc) { 6117 default: 6118 break; 6119 case AArch64::ADDWrr: 6120 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 6121 "ADDWrr does not have register operands"); 6122 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1); 6123 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2); 6124 break; 6125 case AArch64::ADDXrr: 6126 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1); 6127 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2); 6128 break; 6129 case AArch64::SUBWrr: 6130 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2); 6131 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1); 6132 break; 6133 case AArch64::SUBXrr: 6134 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2); 6135 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1); 6136 break; 6137 case AArch64::ADDWri: 6138 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1); 6139 break; 6140 case AArch64::ADDXri: 6141 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1); 6142 break; 6143 case AArch64::SUBWri: 6144 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1); 6145 break; 6146 case AArch64::SUBXri: 6147 setFound(AArch64::MADDXrrr, 1, 
AArch64::XZR, MCP::MULSUBXI_OP1); 6148 break; 6149 case AArch64::ADDv8i8: 6150 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1); 6151 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2); 6152 break; 6153 case AArch64::ADDv16i8: 6154 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1); 6155 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2); 6156 break; 6157 case AArch64::ADDv4i16: 6158 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1); 6159 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2); 6160 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1); 6161 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2); 6162 break; 6163 case AArch64::ADDv8i16: 6164 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1); 6165 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2); 6166 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1); 6167 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2); 6168 break; 6169 case AArch64::ADDv2i32: 6170 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1); 6171 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2); 6172 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1); 6173 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2); 6174 break; 6175 case AArch64::ADDv4i32: 6176 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1); 6177 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2); 6178 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1); 6179 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2); 6180 break; 6181 case AArch64::SUBv8i8: 6182 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1); 6183 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2); 6184 break; 6185 case AArch64::SUBv16i8: 6186 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1); 6187 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2); 6188 break; 6189 case AArch64::SUBv4i16: 6190 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1); 6191 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2); 6192 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1); 6193 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2); 6194 break; 6195 case AArch64::SUBv8i16: 6196 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1); 6197 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2); 6198 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1); 6199 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2); 6200 break; 6201 case AArch64::SUBv2i32: 6202 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1); 6203 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2); 6204 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1); 6205 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2); 6206 break; 6207 case AArch64::SUBv4i32: 6208 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1); 6209 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2); 6210 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1); 6211 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2); 6212 break; 6213 } 6214 return Found; 6215 } 6216 /// Floating-Point Support 6217 6218 /// Find instructions that can be turned into madd. 
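/// Here "madd" loosely covers the fused floating-point forms
/// (FMADD/FMSUB/FNMSUB) and their vector FMLA/FMLS equivalents.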
6219 static bool getFMAPatterns(MachineInstr &Root, 6220 SmallVectorImpl<unsigned> &Patterns) { 6221 6222 if (!isCombineInstrCandidateFP(Root)) 6223 return false; 6224 6225 MachineBasicBlock &MBB = *Root.getParent(); 6226 bool Found = false; 6227 6228 auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool { 6229 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) { 6230 Patterns.push_back(Pattern); 6231 return true; 6232 } 6233 return false; 6234 }; 6235 6236 typedef AArch64MachineCombinerPattern MCP; 6237 6238 switch (Root.getOpcode()) { 6239 default: 6240 assert(false && "Unsupported FP instruction in combiner\n"); 6241 break; 6242 case AArch64::FADDHrr: 6243 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 6244 "FADDHrr does not have register operands"); 6245 6246 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1); 6247 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2); 6248 break; 6249 case AArch64::FADDSrr: 6250 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 6251 "FADDSrr does not have register operands"); 6252 6253 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) || 6254 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1); 6255 6256 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) || 6257 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2); 6258 break; 6259 case AArch64::FADDDrr: 6260 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) || 6261 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1); 6262 6263 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) || 6264 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2); 6265 break; 6266 case AArch64::FADDv4f16: 6267 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) || 6268 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1); 6269 6270 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) || 6271 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2); 6272 break; 6273 case AArch64::FADDv8f16: 6274 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) || 6275 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1); 6276 6277 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) || 6278 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2); 6279 break; 6280 case AArch64::FADDv2f32: 6281 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) || 6282 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1); 6283 6284 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) || 6285 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2); 6286 break; 6287 case AArch64::FADDv2f64: 6288 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) || 6289 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1); 6290 6291 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) || 6292 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2); 6293 break; 6294 case AArch64::FADDv4f32: 6295 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) || 6296 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1); 6297 6298 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) || 6299 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2); 6300 break; 6301 case AArch64::FSUBHrr: 6302 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1); 6303 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2); 6304 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1); 6305 break; 6306 case 
AArch64::FSUBSrr: 6307 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1); 6308 6309 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) || 6310 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2); 6311 6312 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1); 6313 break; 6314 case AArch64::FSUBDrr: 6315 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1); 6316 6317 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) || 6318 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2); 6319 6320 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1); 6321 break; 6322 case AArch64::FSUBv4f16: 6323 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) || 6324 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2); 6325 6326 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) || 6327 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1); 6328 break; 6329 case AArch64::FSUBv8f16: 6330 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) || 6331 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2); 6332 6333 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) || 6334 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1); 6335 break; 6336 case AArch64::FSUBv2f32: 6337 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) || 6338 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2); 6339 6340 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) || 6341 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1); 6342 break; 6343 case AArch64::FSUBv2f64: 6344 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) || 6345 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2); 6346 6347 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) || 6348 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1); 6349 break; 6350 case AArch64::FSUBv4f32: 6351 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) || 6352 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2); 6353 6354 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) || 6355 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1); 6356 break; 6357 } 6358 return Found; 6359 } 6360 6361 static bool getFMULPatterns(MachineInstr &Root, 6362 SmallVectorImpl<unsigned> &Patterns) { 6363 MachineBasicBlock &MBB = *Root.getParent(); 6364 bool Found = false; 6365 6366 auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool { 6367 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6368 MachineOperand &MO = Root.getOperand(Operand); 6369 MachineInstr *MI = nullptr; 6370 if (MO.isReg() && MO.getReg().isVirtual()) 6371 MI = MRI.getUniqueVRegDef(MO.getReg()); 6372 // Ignore No-op COPYs in FMUL(COPY(DUP(..))) 6373 if (MI && MI->getOpcode() == TargetOpcode::COPY && 6374 MI->getOperand(1).getReg().isVirtual()) 6375 MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg()); 6376 if (MI && MI->getOpcode() == Opcode) { 6377 Patterns.push_back(Pattern); 6378 return true; 6379 } 6380 return false; 6381 }; 6382 6383 typedef AArch64MachineCombinerPattern MCP; 6384 6385 switch (Root.getOpcode()) { 6386 default: 6387 return false; 6388 case AArch64::FMULv2f32: 6389 Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1); 6390 Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2); 6391 break; 6392 case AArch64::FMULv2f64: 6393 Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1); 6394 Found |= Match(AArch64::DUPv2i64lane, 2, 
MCP::FMULv2i64_indexed_OP2); 6395 break; 6396 case AArch64::FMULv4f16: 6397 Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1); 6398 Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2); 6399 break; 6400 case AArch64::FMULv4f32: 6401 Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1); 6402 Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2); 6403 break; 6404 case AArch64::FMULv8f16: 6405 Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1); 6406 Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2); 6407 break; 6408 } 6409 6410 return Found; 6411 } 6412 6413 static bool getFNEGPatterns(MachineInstr &Root, 6414 SmallVectorImpl<unsigned> &Patterns) { 6415 unsigned Opc = Root.getOpcode(); 6416 MachineBasicBlock &MBB = *Root.getParent(); 6417 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6418 6419 auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool { 6420 MachineOperand &MO = Root.getOperand(1); 6421 MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg()); 6422 if (MI != nullptr && (MI->getOpcode() == Opcode) && 6423 MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) && 6424 Root.getFlag(MachineInstr::MIFlag::FmContract) && 6425 Root.getFlag(MachineInstr::MIFlag::FmNsz) && 6426 MI->getFlag(MachineInstr::MIFlag::FmContract) && 6427 MI->getFlag(MachineInstr::MIFlag::FmNsz)) { 6428 Patterns.push_back(Pattern); 6429 return true; 6430 } 6431 return false; 6432 }; 6433 6434 switch (Opc) { 6435 default: 6436 break; 6437 case AArch64::FNEGDr: 6438 return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD); 6439 case AArch64::FNEGSr: 6440 return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD); 6441 } 6442 6443 return false; 6444 } 6445 6446 /// Return true when a code sequence can improve throughput. It 6447 /// should be called only for instructions in loops. 
6448 /// \param Pattern - combiner pattern 6449 bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const { 6450 switch (Pattern) { 6451 default: 6452 break; 6453 case AArch64MachineCombinerPattern::FMULADDH_OP1: 6454 case AArch64MachineCombinerPattern::FMULADDH_OP2: 6455 case AArch64MachineCombinerPattern::FMULSUBH_OP1: 6456 case AArch64MachineCombinerPattern::FMULSUBH_OP2: 6457 case AArch64MachineCombinerPattern::FMULADDS_OP1: 6458 case AArch64MachineCombinerPattern::FMULADDS_OP2: 6459 case AArch64MachineCombinerPattern::FMULSUBS_OP1: 6460 case AArch64MachineCombinerPattern::FMULSUBS_OP2: 6461 case AArch64MachineCombinerPattern::FMULADDD_OP1: 6462 case AArch64MachineCombinerPattern::FMULADDD_OP2: 6463 case AArch64MachineCombinerPattern::FMULSUBD_OP1: 6464 case AArch64MachineCombinerPattern::FMULSUBD_OP2: 6465 case AArch64MachineCombinerPattern::FNMULSUBH_OP1: 6466 case AArch64MachineCombinerPattern::FNMULSUBS_OP1: 6467 case AArch64MachineCombinerPattern::FNMULSUBD_OP1: 6468 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1: 6469 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2: 6470 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1: 6471 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2: 6472 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1: 6473 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2: 6474 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1: 6475 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2: 6476 case AArch64MachineCombinerPattern::FMLAv4f16_OP2: 6477 case AArch64MachineCombinerPattern::FMLAv4f16_OP1: 6478 case AArch64MachineCombinerPattern::FMLAv8f16_OP1: 6479 case AArch64MachineCombinerPattern::FMLAv8f16_OP2: 6480 case AArch64MachineCombinerPattern::FMLAv2f32_OP2: 6481 case AArch64MachineCombinerPattern::FMLAv2f32_OP1: 6482 case AArch64MachineCombinerPattern::FMLAv2f64_OP1: 6483 case AArch64MachineCombinerPattern::FMLAv2f64_OP2: 6484 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1: 6485 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2: 6486 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1: 6487 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2: 6488 case AArch64MachineCombinerPattern::FMLAv4f32_OP1: 6489 case AArch64MachineCombinerPattern::FMLAv4f32_OP2: 6490 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1: 6491 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2: 6492 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1: 6493 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2: 6494 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1: 6495 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2: 6496 case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2: 6497 case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2: 6498 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2: 6499 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2: 6500 case AArch64MachineCombinerPattern::FMLSv4f16_OP1: 6501 case AArch64MachineCombinerPattern::FMLSv4f16_OP2: 6502 case AArch64MachineCombinerPattern::FMLSv8f16_OP1: 6503 case AArch64MachineCombinerPattern::FMLSv8f16_OP2: 6504 case AArch64MachineCombinerPattern::FMLSv2f32_OP2: 6505 case AArch64MachineCombinerPattern::FMLSv2f64_OP2: 6506 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2: 6507 case AArch64MachineCombinerPattern::FMLSv4f32_OP2: 6508 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1: 6509 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2: 6510 
case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1: 6511 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2: 6512 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1: 6513 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2: 6514 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1: 6515 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2: 6516 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1: 6517 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2: 6518 case AArch64MachineCombinerPattern::MULADDv8i8_OP1: 6519 case AArch64MachineCombinerPattern::MULADDv8i8_OP2: 6520 case AArch64MachineCombinerPattern::MULADDv16i8_OP1: 6521 case AArch64MachineCombinerPattern::MULADDv16i8_OP2: 6522 case AArch64MachineCombinerPattern::MULADDv4i16_OP1: 6523 case AArch64MachineCombinerPattern::MULADDv4i16_OP2: 6524 case AArch64MachineCombinerPattern::MULADDv8i16_OP1: 6525 case AArch64MachineCombinerPattern::MULADDv8i16_OP2: 6526 case AArch64MachineCombinerPattern::MULADDv2i32_OP1: 6527 case AArch64MachineCombinerPattern::MULADDv2i32_OP2: 6528 case AArch64MachineCombinerPattern::MULADDv4i32_OP1: 6529 case AArch64MachineCombinerPattern::MULADDv4i32_OP2: 6530 case AArch64MachineCombinerPattern::MULSUBv8i8_OP1: 6531 case AArch64MachineCombinerPattern::MULSUBv8i8_OP2: 6532 case AArch64MachineCombinerPattern::MULSUBv16i8_OP1: 6533 case AArch64MachineCombinerPattern::MULSUBv16i8_OP2: 6534 case AArch64MachineCombinerPattern::MULSUBv4i16_OP1: 6535 case AArch64MachineCombinerPattern::MULSUBv4i16_OP2: 6536 case AArch64MachineCombinerPattern::MULSUBv8i16_OP1: 6537 case AArch64MachineCombinerPattern::MULSUBv8i16_OP2: 6538 case AArch64MachineCombinerPattern::MULSUBv2i32_OP1: 6539 case AArch64MachineCombinerPattern::MULSUBv2i32_OP2: 6540 case AArch64MachineCombinerPattern::MULSUBv4i32_OP1: 6541 case AArch64MachineCombinerPattern::MULSUBv4i32_OP2: 6542 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1: 6543 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2: 6544 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1: 6545 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2: 6546 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1: 6547 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2: 6548 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1: 6549 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2: 6550 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 6551 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 6552 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 6553 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 6554 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 6555 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 6556 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 6557 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 6558 return true; 6559 } // end switch (Pattern) 6560 return false; 6561 } 6562 6563 /// Find other MI combine patterns. 6564 static bool getMiscPatterns(MachineInstr &Root, 6565 SmallVectorImpl<unsigned> &Patterns) { 6566 // A - (B + C) ==> (A - B) - C or (A - C) - B 6567 unsigned Opc = Root.getOpcode(); 6568 MachineBasicBlock &MBB = *Root.getParent(); 6569 6570 switch (Opc) { 6571 case AArch64::SUBWrr: 6572 case AArch64::SUBSWrr: 6573 case AArch64::SUBXrr: 6574 case AArch64::SUBSXrr: 6575 // Found candidate root. 
6576 break; 6577 default: 6578 return false; 6579 } 6580 6581 if (isCombineInstrSettingFlag(Opc) && 6582 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) == 6583 -1) 6584 return false; 6585 6586 if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) || 6587 canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) || 6588 canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) || 6589 canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) { 6590 Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP1); 6591 Patterns.push_back(AArch64MachineCombinerPattern::SUBADD_OP2); 6592 return true; 6593 } 6594 6595 return false; 6596 } 6597 6598 CombinerObjective 6599 AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const { 6600 switch (Pattern) { 6601 case AArch64MachineCombinerPattern::SUBADD_OP1: 6602 case AArch64MachineCombinerPattern::SUBADD_OP2: 6603 return CombinerObjective::MustReduceDepth; 6604 default: 6605 return TargetInstrInfo::getCombinerObjective(Pattern); 6606 } 6607 } 6608 6609 /// Return true when there is potentially a faster code sequence for an 6610 /// instruction chain ending in \p Root. All potential patterns are listed in 6611 /// the \p Pattern vector. Pattern should be sorted in priority order since the 6612 /// pattern evaluator stops checking as soon as it finds a faster sequence. 6613 6614 bool AArch64InstrInfo::getMachineCombinerPatterns( 6615 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns, 6616 bool DoRegPressureReduce) const { 6617 // Integer patterns 6618 if (getMaddPatterns(Root, Patterns)) 6619 return true; 6620 // Floating point patterns 6621 if (getFMULPatterns(Root, Patterns)) 6622 return true; 6623 if (getFMAPatterns(Root, Patterns)) 6624 return true; 6625 if (getFNEGPatterns(Root, Patterns)) 6626 return true; 6627 6628 // Other patterns 6629 if (getMiscPatterns(Root, Patterns)) 6630 return true; 6631 6632 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, 6633 DoRegPressureReduce); 6634 } 6635 6636 enum class FMAInstKind { Default, Indexed, Accumulator }; 6637 /// genFusedMultiply - Generate fused multiply instructions. 6638 /// This function supports both integer and floating point instructions. 6639 /// A typical example: 6640 /// F|MUL I=A,B,0 6641 /// F|ADD R,I,C 6642 /// ==> F|MADD R,A,B,C 6643 /// \param MF Containing MachineFunction 6644 /// \param MRI Register information 6645 /// \param TII Target information 6646 /// \param Root is the F|ADD instruction 6647 /// \param [out] InsInstrs is a vector of machine instructions and will 6648 /// contain the generated madd instruction 6649 /// \param IdxMulOpd is index of operand in Root that is the result of 6650 /// the F|MUL. In the example above IdxMulOpd is 1. 6651 /// \param MaddOpc the opcode fo the f|madd instruction 6652 /// \param RC Register class of operands 6653 /// \param kind of fma instruction (addressing mode) to be generated 6654 /// \param ReplacedAddend is the result register from the instruction 6655 /// replacing the non-combined operand, if any. 6656 static MachineInstr * 6657 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, 6658 const TargetInstrInfo *TII, MachineInstr &Root, 6659 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd, 6660 unsigned MaddOpc, const TargetRegisterClass *RC, 6661 FMAInstKind kind = FMAInstKind::Default, 6662 const Register *ReplacedAddend = nullptr) { 6663 assert(IdxMulOpd == 1 || IdxMulOpd == 2); 6664 6665 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 
2 : 1; 6666 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); 6667 Register ResultReg = Root.getOperand(0).getReg(); 6668 Register SrcReg0 = MUL->getOperand(1).getReg(); 6669 bool Src0IsKill = MUL->getOperand(1).isKill(); 6670 Register SrcReg1 = MUL->getOperand(2).getReg(); 6671 bool Src1IsKill = MUL->getOperand(2).isKill(); 6672 6673 Register SrcReg2; 6674 bool Src2IsKill; 6675 if (ReplacedAddend) { 6676 // If we just generated a new addend, the new instruction must be its only use. 6677 SrcReg2 = *ReplacedAddend; 6678 Src2IsKill = true; 6679 } else { 6680 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg(); 6681 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill(); 6682 } 6683 6684 if (ResultReg.isVirtual()) 6685 MRI.constrainRegClass(ResultReg, RC); 6686 if (SrcReg0.isVirtual()) 6687 MRI.constrainRegClass(SrcReg0, RC); 6688 if (SrcReg1.isVirtual()) 6689 MRI.constrainRegClass(SrcReg1, RC); 6690 if (SrcReg2.isVirtual()) 6691 MRI.constrainRegClass(SrcReg2, RC); 6692 6693 MachineInstrBuilder MIB; 6694 if (kind == FMAInstKind::Default) 6695 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg) 6696 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 6697 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 6698 .addReg(SrcReg2, getKillRegState(Src2IsKill)); 6699 else if (kind == FMAInstKind::Indexed) 6700 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg) 6701 .addReg(SrcReg2, getKillRegState(Src2IsKill)) 6702 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 6703 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 6704 .addImm(MUL->getOperand(3).getImm()); 6705 else if (kind == FMAInstKind::Accumulator) 6706 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg) 6707 .addReg(SrcReg2, getKillRegState(Src2IsKill)) 6708 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 6709 .addReg(SrcReg1, getKillRegState(Src1IsKill)); 6710 else 6711 assert(false && "Invalid FMA instruction kind"); 6712 // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS) 6713 InsInstrs.push_back(MIB); 6714 return MUL; 6715 } 6716 6717 static MachineInstr * 6718 genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI, 6719 const TargetInstrInfo *TII, MachineInstr &Root, 6720 SmallVectorImpl<MachineInstr *> &InsInstrs) { 6721 MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); 6722 6723 unsigned Opc = 0; 6724 const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg()); 6725 if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 6726 Opc = AArch64::FNMADDSrrr; 6727 else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) 6728 Opc = AArch64::FNMADDDrrr; 6729 else 6730 return nullptr; 6731 6732 Register ResultReg = Root.getOperand(0).getReg(); 6733 Register SrcReg0 = MAD->getOperand(1).getReg(); 6734 Register SrcReg1 = MAD->getOperand(2).getReg(); 6735 Register SrcReg2 = MAD->getOperand(3).getReg(); 6736 bool Src0IsKill = MAD->getOperand(1).isKill(); 6737 bool Src1IsKill = MAD->getOperand(2).isKill(); 6738 bool Src2IsKill = MAD->getOperand(3).isKill(); 6739 if (ResultReg.isVirtual()) 6740 MRI.constrainRegClass(ResultReg, RC); 6741 if (SrcReg0.isVirtual()) 6742 MRI.constrainRegClass(SrcReg0, RC); 6743 if (SrcReg1.isVirtual()) 6744 MRI.constrainRegClass(SrcReg1, RC); 6745 if (SrcReg2.isVirtual()) 6746 MRI.constrainRegClass(SrcReg2, RC); 6747 6748 MachineInstrBuilder MIB = 6749 BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg) 6750 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 6751 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 6752 .addReg(SrcReg2, getKillRegState(Src2IsKill)); 6753
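  // A note on the intent (AArch64 FP semantics as relied upon here): FNMADD
  // d, n, m, a computes -(n * m) - a, which is exactly the negation of
  // FMADD's (n * m) + a, so the FNEG that consumed the FMADD result in the
  // original sequence is no longer needed.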
InsInstrs.push_back(MIB); 6754 6755 return MAD; 6756 } 6757 6758 /// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane) 6759 static MachineInstr * 6760 genIndexedMultiply(MachineInstr &Root, 6761 SmallVectorImpl<MachineInstr *> &InsInstrs, 6762 unsigned IdxDupOp, unsigned MulOpc, 6763 const TargetRegisterClass *RC, MachineRegisterInfo &MRI) { 6764 assert(((IdxDupOp == 1) || (IdxDupOp == 2)) && 6765 "Invalid index of FMUL operand"); 6766 6767 MachineFunction &MF = *Root.getMF(); 6768 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); 6769 6770 MachineInstr *Dup = 6771 MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg()); 6772 6773 if (Dup->getOpcode() == TargetOpcode::COPY) 6774 Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg()); 6775 6776 Register DupSrcReg = Dup->getOperand(1).getReg(); 6777 MRI.clearKillFlags(DupSrcReg); 6778 MRI.constrainRegClass(DupSrcReg, RC); 6779 6780 unsigned DupSrcLane = Dup->getOperand(2).getImm(); 6781 6782 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1; 6783 MachineOperand &MulOp = Root.getOperand(IdxMulOp); 6784 6785 Register ResultReg = Root.getOperand(0).getReg(); 6786 6787 MachineInstrBuilder MIB; 6788 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg) 6789 .add(MulOp) 6790 .addReg(DupSrcReg) 6791 .addImm(DupSrcLane); 6792 6793 InsInstrs.push_back(MIB); 6794 return &Root; 6795 } 6796 6797 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate 6798 /// instructions. 6799 /// 6800 /// \see genFusedMultiply 6801 static MachineInstr *genFusedMultiplyAcc( 6802 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 6803 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 6804 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) { 6805 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 6806 FMAInstKind::Accumulator); 6807 } 6808 6809 /// genNeg - Helper to generate an intermediate negation of the second operand 6810 /// of Root 6811 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI, 6812 const TargetInstrInfo *TII, MachineInstr &Root, 6813 SmallVectorImpl<MachineInstr *> &InsInstrs, 6814 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, 6815 unsigned MnegOpc, const TargetRegisterClass *RC) { 6816 Register NewVR = MRI.createVirtualRegister(RC); 6817 MachineInstrBuilder MIB = 6818 BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR) 6819 .add(Root.getOperand(2)); 6820 InsInstrs.push_back(MIB); 6821 6822 assert(InstrIdxForVirtReg.empty()); 6823 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6824 6825 return NewVR; 6826 } 6827 6828 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate 6829 /// instructions with an additional negation of the accumulator 6830 static MachineInstr *genFusedMultiplyAccNeg( 6831 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 6832 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 6833 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd, 6834 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) { 6835 assert(IdxMulOpd == 1); 6836 6837 Register NewVR = 6838 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC); 6839 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 6840 FMAInstKind::Accumulator, &NewVR); 6841 } 6842 6843 /// genFusedMultiplyIdx - Helper to generate fused multiply accumulate 6844 /// instructions. 
6844 /// instructions. 6845 /// 6846 /// \see genFusedMultiply 6847 static MachineInstr *genFusedMultiplyIdx( 6848 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 6849 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 6850 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) { 6851 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 6852 FMAInstKind::Indexed); 6853 } 6854 6855 /// genFusedMultiplyIdxNeg - Helper to generate fused multiply indexed 6856 /// instructions with an additional negation of the accumulator 6857 static MachineInstr *genFusedMultiplyIdxNeg( 6858 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 6859 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 6860 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd, 6861 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) { 6862 assert(IdxMulOpd == 1); 6863 6864 Register NewVR = 6865 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC); 6866 6867 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 6868 FMAInstKind::Indexed, &NewVR); 6869 } 6870 6871 /// genMaddR - Generate madd instruction and combine mul and add using 6872 /// an extra virtual register 6873 /// Example - an ADD intermediate needs to be stored in a register: 6874 /// MUL I=A,B,0 6875 /// ADD R,I,Imm 6876 /// ==> ORR V, ZR, Imm 6877 /// ==> MADD R,A,B,V 6878 /// \param MF Containing MachineFunction 6879 /// \param MRI Register information 6880 /// \param TII Target information 6881 /// \param Root is the ADD instruction 6882 /// \param [out] InsInstrs is a vector of machine instructions and will 6883 /// contain the generated madd instruction 6884 /// \param IdxMulOpd is index of operand in Root that is the result of 6885 /// the MUL. In the example above IdxMulOpd is 1. 6886 /// \param MaddOpc the opcode of the madd instruction 6887 /// \param VR is a virtual register that holds the value of an ADD operand 6888 /// (V in the example above).
6889 /// \param RC Register class of operands 6890 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, 6891 const TargetInstrInfo *TII, MachineInstr &Root, 6892 SmallVectorImpl<MachineInstr *> &InsInstrs, 6893 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, 6894 const TargetRegisterClass *RC) { 6895 assert(IdxMulOpd == 1 || IdxMulOpd == 2); 6896 6897 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); 6898 Register ResultReg = Root.getOperand(0).getReg(); 6899 Register SrcReg0 = MUL->getOperand(1).getReg(); 6900 bool Src0IsKill = MUL->getOperand(1).isKill(); 6901 Register SrcReg1 = MUL->getOperand(2).getReg(); 6902 bool Src1IsKill = MUL->getOperand(2).isKill(); 6903 6904 if (ResultReg.isVirtual()) 6905 MRI.constrainRegClass(ResultReg, RC); 6906 if (SrcReg0.isVirtual()) 6907 MRI.constrainRegClass(SrcReg0, RC); 6908 if (SrcReg1.isVirtual()) 6909 MRI.constrainRegClass(SrcReg1, RC); 6910 if (Register::isVirtualRegister(VR)) 6911 MRI.constrainRegClass(VR, RC); 6912 6913 MachineInstrBuilder MIB = 6914 BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg) 6915 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 6916 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 6917 .addReg(VR); 6918 // Insert the MADD 6919 InsInstrs.push_back(MIB); 6920 return MUL; 6921 } 6922 6923 /// Do the following transformation 6924 /// A - (B + C) ==> (A - B) - C 6925 /// A - (B + C) ==> (A - C) - B 6926 static void 6927 genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI, 6928 const TargetInstrInfo *TII, MachineInstr &Root, 6929 SmallVectorImpl<MachineInstr *> &InsInstrs, 6930 SmallVectorImpl<MachineInstr *> &DelInstrs, 6931 unsigned IdxOpd1, 6932 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) { 6933 assert(IdxOpd1 == 1 || IdxOpd1 == 2); 6934 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 
2 : 1; 6935 MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg()); 6936 6937 Register ResultReg = Root.getOperand(0).getReg(); 6938 Register RegA = Root.getOperand(1).getReg(); 6939 bool RegAIsKill = Root.getOperand(1).isKill(); 6940 Register RegB = AddMI->getOperand(IdxOpd1).getReg(); 6941 bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill(); 6942 Register RegC = AddMI->getOperand(IdxOtherOpd).getReg(); 6943 bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill(); 6944 Register NewVR = MRI.createVirtualRegister(MRI.getRegClass(RegA)); 6945 6946 unsigned Opcode = Root.getOpcode(); 6947 if (Opcode == AArch64::SUBSWrr) 6948 Opcode = AArch64::SUBWrr; 6949 else if (Opcode == AArch64::SUBSXrr) 6950 Opcode = AArch64::SUBXrr; 6951 else 6952 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) && 6953 "Unexpected instruction opcode."); 6954 6955 uint32_t Flags = Root.mergeFlagsWith(*AddMI); 6956 Flags &= ~MachineInstr::NoSWrap; 6957 Flags &= ~MachineInstr::NoUWrap; 6958 6959 MachineInstrBuilder MIB1 = 6960 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR) 6961 .addReg(RegA, getKillRegState(RegAIsKill)) 6962 .addReg(RegB, getKillRegState(RegBIsKill)) 6963 .setMIFlags(Flags); 6964 MachineInstrBuilder MIB2 = 6965 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg) 6966 .addReg(NewVR, getKillRegState(true)) 6967 .addReg(RegC, getKillRegState(RegCIsKill)) 6968 .setMIFlags(Flags); 6969 6970 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6971 InsInstrs.push_back(MIB1); 6972 InsInstrs.push_back(MIB2); 6973 DelInstrs.push_back(AddMI); 6974 DelInstrs.push_back(&Root); 6975 } 6976 6977 /// When getMachineCombinerPatterns() finds potential patterns, 6978 /// this function generates the instructions that could replace the 6979 /// original code sequence 6980 void AArch64InstrInfo::genAlternativeCodeSequence( 6981 MachineInstr &Root, unsigned Pattern, 6982 SmallVectorImpl<MachineInstr *> &InsInstrs, 6983 SmallVectorImpl<MachineInstr *> &DelInstrs, 6984 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const { 6985 MachineBasicBlock &MBB = *Root.getParent(); 6986 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6987 MachineFunction &MF = *MBB.getParent(); 6988 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); 6989 6990 MachineInstr *MUL = nullptr; 6991 const TargetRegisterClass *RC; 6992 unsigned Opc; 6993 switch (Pattern) { 6994 default: 6995 // Reassociate instructions. 
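    // Patterns without a bespoke expansion in the cases below (in particular
    // the generic reassociation patterns) are expanded by the
    // target-independent implementation.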
6996 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, 6997 DelInstrs, InstrIdxForVirtReg); 6998 return; 6999 case AArch64MachineCombinerPattern::SUBADD_OP1: 7000 // A - (B + C) 7001 // ==> (A - B) - C 7002 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1, 7003 InstrIdxForVirtReg); 7004 return; 7005 case AArch64MachineCombinerPattern::SUBADD_OP2: 7006 // A - (B + C) 7007 // ==> (A - C) - B 7008 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2, 7009 InstrIdxForVirtReg); 7010 return; 7011 case AArch64MachineCombinerPattern::MULADDW_OP1: 7012 case AArch64MachineCombinerPattern::MULADDX_OP1: 7013 // MUL I=A,B,0 7014 // ADD R,I,C 7015 // ==> MADD R,A,B,C 7016 // --- Create(MADD); 7017 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP1) { 7018 Opc = AArch64::MADDWrrr; 7019 RC = &AArch64::GPR32RegClass; 7020 } else { 7021 Opc = AArch64::MADDXrrr; 7022 RC = &AArch64::GPR64RegClass; 7023 } 7024 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7025 break; 7026 case AArch64MachineCombinerPattern::MULADDW_OP2: 7027 case AArch64MachineCombinerPattern::MULADDX_OP2: 7028 // MUL I=A,B,0 7029 // ADD R,C,I 7030 // ==> MADD R,A,B,C 7031 // --- Create(MADD); 7032 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP2) { 7033 Opc = AArch64::MADDWrrr; 7034 RC = &AArch64::GPR32RegClass; 7035 } else { 7036 Opc = AArch64::MADDXrrr; 7037 RC = &AArch64::GPR64RegClass; 7038 } 7039 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7040 break; 7041 case AArch64MachineCombinerPattern::MULADDWI_OP1: 7042 case AArch64MachineCombinerPattern::MULADDXI_OP1: { 7043 // MUL I=A,B,0 7044 // ADD R,I,Imm 7045 // ==> MOV V, Imm 7046 // ==> MADD R,A,B,V 7047 // --- Create(MADD); 7048 const TargetRegisterClass *OrrRC; 7049 unsigned BitSize, OrrOpc, ZeroReg; 7050 if (Pattern == AArch64MachineCombinerPattern::MULADDWI_OP1) { 7051 OrrOpc = AArch64::ORRWri; 7052 OrrRC = &AArch64::GPR32spRegClass; 7053 BitSize = 32; 7054 ZeroReg = AArch64::WZR; 7055 Opc = AArch64::MADDWrrr; 7056 RC = &AArch64::GPR32RegClass; 7057 } else { 7058 OrrOpc = AArch64::ORRXri; 7059 OrrRC = &AArch64::GPR64spRegClass; 7060 BitSize = 64; 7061 ZeroReg = AArch64::XZR; 7062 Opc = AArch64::MADDXrrr; 7063 RC = &AArch64::GPR64RegClass; 7064 } 7065 Register NewVR = MRI.createVirtualRegister(OrrRC); 7066 uint64_t Imm = Root.getOperand(2).getImm(); 7067 7068 if (Root.getOperand(3).isImm()) { 7069 unsigned Val = Root.getOperand(3).getImm(); 7070 Imm = Imm << Val; 7071 } 7072 uint64_t UImm = SignExtend64(Imm, BitSize); 7073 // The immediate can be composed via a single instruction. 7074 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 7075 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn); 7076 if (Insn.size() != 1) 7077 return; 7078 auto MovI = Insn.begin(); 7079 MachineInstrBuilder MIB1; 7080 // MOV is an alias for one of three instructions: movz, movn, and orr. 
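      // The builder calls below mirror those forms: for the ORR alias the
      // source is the zero register and Op2 holds the encoded logical
      // immediate, while MOVZ/MOVN take the 16-bit chunk (Op1) and the shift
      // (Op2) produced by expandMOVImm.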
7081 if (MovI->Opcode == OrrOpc) 7082 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR) 7083 .addReg(ZeroReg) 7084 .addImm(MovI->Op2); 7085 else { 7086 if (BitSize == 32) 7087 assert((MovI->Opcode == AArch64::MOVNWi || 7088 MovI->Opcode == AArch64::MOVZWi) && 7089 "Expected opcode"); 7090 else 7091 assert((MovI->Opcode == AArch64::MOVNXi || 7092 MovI->Opcode == AArch64::MOVZXi) && 7093 "Expected opcode"); 7094 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR) 7095 .addImm(MovI->Op1) 7096 .addImm(MovI->Op2); 7097 } 7098 InsInstrs.push_back(MIB1); 7099 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7100 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 7101 break; 7102 } 7103 case AArch64MachineCombinerPattern::MULSUBW_OP1: 7104 case AArch64MachineCombinerPattern::MULSUBX_OP1: { 7105 // MUL I=A,B,0 7106 // SUB R,I, C 7107 // ==> SUB V, 0, C 7108 // ==> MADD R,A,B,V // = -C + A*B 7109 // --- Create(MADD); 7110 const TargetRegisterClass *SubRC; 7111 unsigned SubOpc, ZeroReg; 7112 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP1) { 7113 SubOpc = AArch64::SUBWrr; 7114 SubRC = &AArch64::GPR32spRegClass; 7115 ZeroReg = AArch64::WZR; 7116 Opc = AArch64::MADDWrrr; 7117 RC = &AArch64::GPR32RegClass; 7118 } else { 7119 SubOpc = AArch64::SUBXrr; 7120 SubRC = &AArch64::GPR64spRegClass; 7121 ZeroReg = AArch64::XZR; 7122 Opc = AArch64::MADDXrrr; 7123 RC = &AArch64::GPR64RegClass; 7124 } 7125 Register NewVR = MRI.createVirtualRegister(SubRC); 7126 // SUB NewVR, 0, C 7127 MachineInstrBuilder MIB1 = 7128 BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR) 7129 .addReg(ZeroReg) 7130 .add(Root.getOperand(2)); 7131 InsInstrs.push_back(MIB1); 7132 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7133 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 7134 break; 7135 } 7136 case AArch64MachineCombinerPattern::MULSUBW_OP2: 7137 case AArch64MachineCombinerPattern::MULSUBX_OP2: 7138 // MUL I=A,B,0 7139 // SUB R,C,I 7140 // ==> MSUB R,A,B,C (computes C - A*B) 7141 // --- Create(MSUB); 7142 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP2) { 7143 Opc = AArch64::MSUBWrrr; 7144 RC = &AArch64::GPR32RegClass; 7145 } else { 7146 Opc = AArch64::MSUBXrrr; 7147 RC = &AArch64::GPR64RegClass; 7148 } 7149 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7150 break; 7151 case AArch64MachineCombinerPattern::MULSUBWI_OP1: 7152 case AArch64MachineCombinerPattern::MULSUBXI_OP1: { 7153 // MUL I=A,B,0 7154 // SUB R,I, Imm 7155 // ==> MOV V, -Imm 7156 // ==> MADD R,A,B,V // = -Imm + A*B 7157 // --- Create(MADD); 7158 const TargetRegisterClass *OrrRC; 7159 unsigned BitSize, OrrOpc, ZeroReg; 7160 if (Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1) { 7161 OrrOpc = AArch64::ORRWri; 7162 OrrRC = &AArch64::GPR32spRegClass; 7163 BitSize = 32; 7164 ZeroReg = AArch64::WZR; 7165 Opc = AArch64::MADDWrrr; 7166 RC = &AArch64::GPR32RegClass; 7167 } else { 7168 OrrOpc = AArch64::ORRXri; 7169 OrrRC = &AArch64::GPR64spRegClass; 7170 BitSize = 64; 7171 ZeroReg = AArch64::XZR; 7172 Opc = AArch64::MADDXrrr; 7173 RC = &AArch64::GPR64RegClass; 7174 } 7175 Register NewVR = MRI.createVirtualRegister(OrrRC); 7176 uint64_t Imm = Root.getOperand(2).getImm(); 7177 if (Root.getOperand(3).isImm()) { 7178 unsigned Val = Root.getOperand(3).getImm(); 7179 Imm = Imm << Val; 7180 } 7181 uint64_t UImm = SignExtend64(-Imm, BitSize); 7182 // The immediate can be composed via a single instruction. 
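    // (Same approach as MULADDWI_OP1 above: give up below unless expandMOVImm
    // can materialize the negated immediate in exactly one instruction, since
    // a longer MOV sequence would likely not pay for itself.)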
7183 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 7184 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn); 7185 if (Insn.size() != 1) 7186 return; 7187 auto MovI = Insn.begin(); 7188 MachineInstrBuilder MIB1; 7189 // MOV is an alias for one of three instructions: movz, movn, and orr. 7190 if (MovI->Opcode == OrrOpc) 7191 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR) 7192 .addReg(ZeroReg) 7193 .addImm(MovI->Op2); 7194 else { 7195 if (BitSize == 32) 7196 assert((MovI->Opcode == AArch64::MOVNWi || 7197 MovI->Opcode == AArch64::MOVZWi) && 7198 "Expected opcode"); 7199 else 7200 assert((MovI->Opcode == AArch64::MOVNXi || 7201 MovI->Opcode == AArch64::MOVZXi) && 7202 "Expected opcode"); 7203 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR) 7204 .addImm(MovI->Op1) 7205 .addImm(MovI->Op2); 7206 } 7207 InsInstrs.push_back(MIB1); 7208 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7209 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 7210 break; 7211 } 7212 7213 case AArch64MachineCombinerPattern::MULADDv8i8_OP1: 7214 Opc = AArch64::MLAv8i8; 7215 RC = &AArch64::FPR64RegClass; 7216 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7217 break; 7218 case AArch64MachineCombinerPattern::MULADDv8i8_OP2: 7219 Opc = AArch64::MLAv8i8; 7220 RC = &AArch64::FPR64RegClass; 7221 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7222 break; 7223 case AArch64MachineCombinerPattern::MULADDv16i8_OP1: 7224 Opc = AArch64::MLAv16i8; 7225 RC = &AArch64::FPR128RegClass; 7226 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7227 break; 7228 case AArch64MachineCombinerPattern::MULADDv16i8_OP2: 7229 Opc = AArch64::MLAv16i8; 7230 RC = &AArch64::FPR128RegClass; 7231 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7232 break; 7233 case AArch64MachineCombinerPattern::MULADDv4i16_OP1: 7234 Opc = AArch64::MLAv4i16; 7235 RC = &AArch64::FPR64RegClass; 7236 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7237 break; 7238 case AArch64MachineCombinerPattern::MULADDv4i16_OP2: 7239 Opc = AArch64::MLAv4i16; 7240 RC = &AArch64::FPR64RegClass; 7241 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7242 break; 7243 case AArch64MachineCombinerPattern::MULADDv8i16_OP1: 7244 Opc = AArch64::MLAv8i16; 7245 RC = &AArch64::FPR128RegClass; 7246 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7247 break; 7248 case AArch64MachineCombinerPattern::MULADDv8i16_OP2: 7249 Opc = AArch64::MLAv8i16; 7250 RC = &AArch64::FPR128RegClass; 7251 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7252 break; 7253 case AArch64MachineCombinerPattern::MULADDv2i32_OP1: 7254 Opc = AArch64::MLAv2i32; 7255 RC = &AArch64::FPR64RegClass; 7256 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7257 break; 7258 case AArch64MachineCombinerPattern::MULADDv2i32_OP2: 7259 Opc = AArch64::MLAv2i32; 7260 RC = &AArch64::FPR64RegClass; 7261 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7262 break; 7263 case AArch64MachineCombinerPattern::MULADDv4i32_OP1: 7264 Opc = AArch64::MLAv4i32; 7265 RC = &AArch64::FPR128RegClass; 7266 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7267 break; 7268 case AArch64MachineCombinerPattern::MULADDv4i32_OP2: 7269 Opc = AArch64::MLAv4i32; 7270 RC = &AArch64::FPR128RegClass; 7271 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7272 break; 7273 7274 
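  // The vector MULSUB patterns below handle the two operand orders
  // differently: when the multiply feeds the second SUB operand (OP2) the
  // result is Acc - A*B, which maps directly onto MLS; when it feeds the
  // first operand (OP1) the accumulator is negated first (a NEG inserted by
  // genFusedMultiplyAccNeg) so that MLA can compute -Acc + A*B.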
case AArch64MachineCombinerPattern::MULSUBv8i8_OP1: 7275 Opc = AArch64::MLAv8i8; 7276 RC = &AArch64::FPR64RegClass; 7277 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 7278 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8, 7279 RC); 7280 break; 7281 case AArch64MachineCombinerPattern::MULSUBv8i8_OP2: 7282 Opc = AArch64::MLSv8i8; 7283 RC = &AArch64::FPR64RegClass; 7284 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7285 break; 7286 case AArch64MachineCombinerPattern::MULSUBv16i8_OP1: 7287 Opc = AArch64::MLAv16i8; 7288 RC = &AArch64::FPR128RegClass; 7289 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 7290 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8, 7291 RC); 7292 break; 7293 case AArch64MachineCombinerPattern::MULSUBv16i8_OP2: 7294 Opc = AArch64::MLSv16i8; 7295 RC = &AArch64::FPR128RegClass; 7296 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7297 break; 7298 case AArch64MachineCombinerPattern::MULSUBv4i16_OP1: 7299 Opc = AArch64::MLAv4i16; 7300 RC = &AArch64::FPR64RegClass; 7301 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 7302 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, 7303 RC); 7304 break; 7305 case AArch64MachineCombinerPattern::MULSUBv4i16_OP2: 7306 Opc = AArch64::MLSv4i16; 7307 RC = &AArch64::FPR64RegClass; 7308 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7309 break; 7310 case AArch64MachineCombinerPattern::MULSUBv8i16_OP1: 7311 Opc = AArch64::MLAv8i16; 7312 RC = &AArch64::FPR128RegClass; 7313 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 7314 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, 7315 RC); 7316 break; 7317 case AArch64MachineCombinerPattern::MULSUBv8i16_OP2: 7318 Opc = AArch64::MLSv8i16; 7319 RC = &AArch64::FPR128RegClass; 7320 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7321 break; 7322 case AArch64MachineCombinerPattern::MULSUBv2i32_OP1: 7323 Opc = AArch64::MLAv2i32; 7324 RC = &AArch64::FPR64RegClass; 7325 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 7326 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, 7327 RC); 7328 break; 7329 case AArch64MachineCombinerPattern::MULSUBv2i32_OP2: 7330 Opc = AArch64::MLSv2i32; 7331 RC = &AArch64::FPR64RegClass; 7332 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7333 break; 7334 case AArch64MachineCombinerPattern::MULSUBv4i32_OP1: 7335 Opc = AArch64::MLAv4i32; 7336 RC = &AArch64::FPR128RegClass; 7337 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 7338 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, 7339 RC); 7340 break; 7341 case AArch64MachineCombinerPattern::MULSUBv4i32_OP2: 7342 Opc = AArch64::MLSv4i32; 7343 RC = &AArch64::FPR128RegClass; 7344 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7345 break; 7346 7347 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1: 7348 Opc = AArch64::MLAv4i16_indexed; 7349 RC = &AArch64::FPR64RegClass; 7350 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7351 break; 7352 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2: 7353 Opc = AArch64::MLAv4i16_indexed; 7354 RC = &AArch64::FPR64RegClass; 7355 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7356 break; 7357 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1: 7358 Opc = AArch64::MLAv8i16_indexed; 7359 RC = &AArch64::FPR128RegClass; 7360 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7361 break; 7362 case 
AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2: 7363 Opc = AArch64::MLAv8i16_indexed; 7364 RC = &AArch64::FPR128RegClass; 7365 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7366 break; 7367 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1: 7368 Opc = AArch64::MLAv2i32_indexed; 7369 RC = &AArch64::FPR64RegClass; 7370 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7371 break; 7372 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2: 7373 Opc = AArch64::MLAv2i32_indexed; 7374 RC = &AArch64::FPR64RegClass; 7375 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7376 break; 7377 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1: 7378 Opc = AArch64::MLAv4i32_indexed; 7379 RC = &AArch64::FPR128RegClass; 7380 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7381 break; 7382 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2: 7383 Opc = AArch64::MLAv4i32_indexed; 7384 RC = &AArch64::FPR128RegClass; 7385 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7386 break; 7387 7388 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 7389 Opc = AArch64::MLAv4i16_indexed; 7390 RC = &AArch64::FPR64RegClass; 7391 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 7392 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, 7393 RC); 7394 break; 7395 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 7396 Opc = AArch64::MLSv4i16_indexed; 7397 RC = &AArch64::FPR64RegClass; 7398 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7399 break; 7400 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 7401 Opc = AArch64::MLAv8i16_indexed; 7402 RC = &AArch64::FPR128RegClass; 7403 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 7404 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, 7405 RC); 7406 break; 7407 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 7408 Opc = AArch64::MLSv8i16_indexed; 7409 RC = &AArch64::FPR128RegClass; 7410 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7411 break; 7412 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 7413 Opc = AArch64::MLAv2i32_indexed; 7414 RC = &AArch64::FPR64RegClass; 7415 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 7416 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, 7417 RC); 7418 break; 7419 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 7420 Opc = AArch64::MLSv2i32_indexed; 7421 RC = &AArch64::FPR64RegClass; 7422 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7423 break; 7424 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 7425 Opc = AArch64::MLAv4i32_indexed; 7426 RC = &AArch64::FPR128RegClass; 7427 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 7428 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, 7429 RC); 7430 break; 7431 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 7432 Opc = AArch64::MLSv4i32_indexed; 7433 RC = &AArch64::FPR128RegClass; 7434 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7435 break; 7436 7437 // Floating Point Support 7438 case AArch64MachineCombinerPattern::FMULADDH_OP1: 7439 Opc = AArch64::FMADDHrrr; 7440 RC = &AArch64::FPR16RegClass; 7441 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7442 break; 7443 case AArch64MachineCombinerPattern::FMULADDS_OP1: 7444 Opc = AArch64::FMADDSrrr; 7445 RC = &AArch64::FPR32RegClass; 7446 MUL = genFusedMultiply(MF, MRI, TII, 
Root, InsInstrs, 1, Opc, RC); 7447 break; 7448 case AArch64MachineCombinerPattern::FMULADDD_OP1: 7449 Opc = AArch64::FMADDDrrr; 7450 RC = &AArch64::FPR64RegClass; 7451 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7452 break; 7453 7454 case AArch64MachineCombinerPattern::FMULADDH_OP2: 7455 Opc = AArch64::FMADDHrrr; 7456 RC = &AArch64::FPR16RegClass; 7457 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7458 break; 7459 case AArch64MachineCombinerPattern::FMULADDS_OP2: 7460 Opc = AArch64::FMADDSrrr; 7461 RC = &AArch64::FPR32RegClass; 7462 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7463 break; 7464 case AArch64MachineCombinerPattern::FMULADDD_OP2: 7465 Opc = AArch64::FMADDDrrr; 7466 RC = &AArch64::FPR64RegClass; 7467 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7468 break; 7469 7470 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1: 7471 Opc = AArch64::FMLAv1i32_indexed; 7472 RC = &AArch64::FPR32RegClass; 7473 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7474 FMAInstKind::Indexed); 7475 break; 7476 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2: 7477 Opc = AArch64::FMLAv1i32_indexed; 7478 RC = &AArch64::FPR32RegClass; 7479 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7480 FMAInstKind::Indexed); 7481 break; 7482 7483 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1: 7484 Opc = AArch64::FMLAv1i64_indexed; 7485 RC = &AArch64::FPR64RegClass; 7486 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7487 FMAInstKind::Indexed); 7488 break; 7489 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2: 7490 Opc = AArch64::FMLAv1i64_indexed; 7491 RC = &AArch64::FPR64RegClass; 7492 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7493 FMAInstKind::Indexed); 7494 break; 7495 7496 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1: 7497 RC = &AArch64::FPR64RegClass; 7498 Opc = AArch64::FMLAv4i16_indexed; 7499 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7500 FMAInstKind::Indexed); 7501 break; 7502 case AArch64MachineCombinerPattern::FMLAv4f16_OP1: 7503 RC = &AArch64::FPR64RegClass; 7504 Opc = AArch64::FMLAv4f16; 7505 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7506 FMAInstKind::Accumulator); 7507 break; 7508 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2: 7509 RC = &AArch64::FPR64RegClass; 7510 Opc = AArch64::FMLAv4i16_indexed; 7511 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7512 FMAInstKind::Indexed); 7513 break; 7514 case AArch64MachineCombinerPattern::FMLAv4f16_OP2: 7515 RC = &AArch64::FPR64RegClass; 7516 Opc = AArch64::FMLAv4f16; 7517 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7518 FMAInstKind::Accumulator); 7519 break; 7520 7521 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1: 7522 case AArch64MachineCombinerPattern::FMLAv2f32_OP1: 7523 RC = &AArch64::FPR64RegClass; 7524 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1) { 7525 Opc = AArch64::FMLAv2i32_indexed; 7526 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7527 FMAInstKind::Indexed); 7528 } else { 7529 Opc = AArch64::FMLAv2f32; 7530 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7531 FMAInstKind::Accumulator); 7532 } 7533 break; 7534 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2: 7535 case AArch64MachineCombinerPattern::FMLAv2f32_OP2: 7536 RC = &AArch64::FPR64RegClass; 7537 if 
(Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2) { 7538 Opc = AArch64::FMLAv2i32_indexed; 7539 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7540 FMAInstKind::Indexed); 7541 } else { 7542 Opc = AArch64::FMLAv2f32; 7543 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7544 FMAInstKind::Accumulator); 7545 } 7546 break; 7547 7548 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1: 7549 RC = &AArch64::FPR128RegClass; 7550 Opc = AArch64::FMLAv8i16_indexed; 7551 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7552 FMAInstKind::Indexed); 7553 break; 7554 case AArch64MachineCombinerPattern::FMLAv8f16_OP1: 7555 RC = &AArch64::FPR128RegClass; 7556 Opc = AArch64::FMLAv8f16; 7557 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7558 FMAInstKind::Accumulator); 7559 break; 7560 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2: 7561 RC = &AArch64::FPR128RegClass; 7562 Opc = AArch64::FMLAv8i16_indexed; 7563 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7564 FMAInstKind::Indexed); 7565 break; 7566 case AArch64MachineCombinerPattern::FMLAv8f16_OP2: 7567 RC = &AArch64::FPR128RegClass; 7568 Opc = AArch64::FMLAv8f16; 7569 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7570 FMAInstKind::Accumulator); 7571 break; 7572 7573 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1: 7574 case AArch64MachineCombinerPattern::FMLAv2f64_OP1: 7575 RC = &AArch64::FPR128RegClass; 7576 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1) { 7577 Opc = AArch64::FMLAv2i64_indexed; 7578 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7579 FMAInstKind::Indexed); 7580 } else { 7581 Opc = AArch64::FMLAv2f64; 7582 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7583 FMAInstKind::Accumulator); 7584 } 7585 break; 7586 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2: 7587 case AArch64MachineCombinerPattern::FMLAv2f64_OP2: 7588 RC = &AArch64::FPR128RegClass; 7589 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2) { 7590 Opc = AArch64::FMLAv2i64_indexed; 7591 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7592 FMAInstKind::Indexed); 7593 } else { 7594 Opc = AArch64::FMLAv2f64; 7595 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7596 FMAInstKind::Accumulator); 7597 } 7598 break; 7599 7600 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1: 7601 case AArch64MachineCombinerPattern::FMLAv4f32_OP1: 7602 RC = &AArch64::FPR128RegClass; 7603 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1) { 7604 Opc = AArch64::FMLAv4i32_indexed; 7605 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7606 FMAInstKind::Indexed); 7607 } else { 7608 Opc = AArch64::FMLAv4f32; 7609 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7610 FMAInstKind::Accumulator); 7611 } 7612 break; 7613 7614 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2: 7615 case AArch64MachineCombinerPattern::FMLAv4f32_OP2: 7616 RC = &AArch64::FPR128RegClass; 7617 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2) { 7618 Opc = AArch64::FMLAv4i32_indexed; 7619 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7620 FMAInstKind::Indexed); 7621 } else { 7622 Opc = AArch64::FMLAv4f32; 7623 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7624 FMAInstKind::Accumulator); 7625 } 7626 break; 7627 7628 case 
AArch64MachineCombinerPattern::FMULSUBH_OP1: 7629 Opc = AArch64::FNMSUBHrrr; 7630 RC = &AArch64::FPR16RegClass; 7631 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7632 break; 7633 case AArch64MachineCombinerPattern::FMULSUBS_OP1: 7634 Opc = AArch64::FNMSUBSrrr; 7635 RC = &AArch64::FPR32RegClass; 7636 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7637 break; 7638 case AArch64MachineCombinerPattern::FMULSUBD_OP1: 7639 Opc = AArch64::FNMSUBDrrr; 7640 RC = &AArch64::FPR64RegClass; 7641 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7642 break; 7643 7644 case AArch64MachineCombinerPattern::FNMULSUBH_OP1: 7645 Opc = AArch64::FNMADDHrrr; 7646 RC = &AArch64::FPR16RegClass; 7647 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7648 break; 7649 case AArch64MachineCombinerPattern::FNMULSUBS_OP1: 7650 Opc = AArch64::FNMADDSrrr; 7651 RC = &AArch64::FPR32RegClass; 7652 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7653 break; 7654 case AArch64MachineCombinerPattern::FNMULSUBD_OP1: 7655 Opc = AArch64::FNMADDDrrr; 7656 RC = &AArch64::FPR64RegClass; 7657 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7658 break; 7659 7660 case AArch64MachineCombinerPattern::FMULSUBH_OP2: 7661 Opc = AArch64::FMSUBHrrr; 7662 RC = &AArch64::FPR16RegClass; 7663 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7664 break; 7665 case AArch64MachineCombinerPattern::FMULSUBS_OP2: 7666 Opc = AArch64::FMSUBSrrr; 7667 RC = &AArch64::FPR32RegClass; 7668 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7669 break; 7670 case AArch64MachineCombinerPattern::FMULSUBD_OP2: 7671 Opc = AArch64::FMSUBDrrr; 7672 RC = &AArch64::FPR64RegClass; 7673 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7674 break; 7675 7676 case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2: 7677 Opc = AArch64::FMLSv1i32_indexed; 7678 RC = &AArch64::FPR32RegClass; 7679 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7680 FMAInstKind::Indexed); 7681 break; 7682 7683 case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2: 7684 Opc = AArch64::FMLSv1i64_indexed; 7685 RC = &AArch64::FPR64RegClass; 7686 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7687 FMAInstKind::Indexed); 7688 break; 7689 7690 case AArch64MachineCombinerPattern::FMLSv4f16_OP1: 7691 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1: { 7692 RC = &AArch64::FPR64RegClass; 7693 Register NewVR = MRI.createVirtualRegister(RC); 7694 MachineInstrBuilder MIB1 = 7695 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR) 7696 .add(Root.getOperand(2)); 7697 InsInstrs.push_back(MIB1); 7698 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7699 if (Pattern == AArch64MachineCombinerPattern::FMLSv4f16_OP1) { 7700 Opc = AArch64::FMLAv4f16; 7701 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7702 FMAInstKind::Accumulator, &NewVR); 7703 } else { 7704 Opc = AArch64::FMLAv4i16_indexed; 7705 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7706 FMAInstKind::Indexed, &NewVR); 7707 } 7708 break; 7709 } 7710 case AArch64MachineCombinerPattern::FMLSv4f16_OP2: 7711 RC = &AArch64::FPR64RegClass; 7712 Opc = AArch64::FMLSv4f16; 7713 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7714 FMAInstKind::Accumulator); 7715 break; 7716 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2: 7717 RC = &AArch64::FPR64RegClass; 7718 Opc = 
AArch64::FMLSv4i16_indexed; 7719 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7720 FMAInstKind::Indexed); 7721 break; 7722 7723 case AArch64MachineCombinerPattern::FMLSv2f32_OP2: 7724 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2: 7725 RC = &AArch64::FPR64RegClass; 7726 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2) { 7727 Opc = AArch64::FMLSv2i32_indexed; 7728 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7729 FMAInstKind::Indexed); 7730 } else { 7731 Opc = AArch64::FMLSv2f32; 7732 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7733 FMAInstKind::Accumulator); 7734 } 7735 break; 7736 7737 case AArch64MachineCombinerPattern::FMLSv8f16_OP1: 7738 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1: { 7739 RC = &AArch64::FPR128RegClass; 7740 Register NewVR = MRI.createVirtualRegister(RC); 7741 MachineInstrBuilder MIB1 = 7742 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR) 7743 .add(Root.getOperand(2)); 7744 InsInstrs.push_back(MIB1); 7745 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7746 if (Pattern == AArch64MachineCombinerPattern::FMLSv8f16_OP1) { 7747 Opc = AArch64::FMLAv8f16; 7748 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7749 FMAInstKind::Accumulator, &NewVR); 7750 } else { 7751 Opc = AArch64::FMLAv8i16_indexed; 7752 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7753 FMAInstKind::Indexed, &NewVR); 7754 } 7755 break; 7756 } 7757 case AArch64MachineCombinerPattern::FMLSv8f16_OP2: 7758 RC = &AArch64::FPR128RegClass; 7759 Opc = AArch64::FMLSv8f16; 7760 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7761 FMAInstKind::Accumulator); 7762 break; 7763 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2: 7764 RC = &AArch64::FPR128RegClass; 7765 Opc = AArch64::FMLSv8i16_indexed; 7766 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7767 FMAInstKind::Indexed); 7768 break; 7769 7770 case AArch64MachineCombinerPattern::FMLSv2f64_OP2: 7771 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2: 7772 RC = &AArch64::FPR128RegClass; 7773 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2) { 7774 Opc = AArch64::FMLSv2i64_indexed; 7775 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7776 FMAInstKind::Indexed); 7777 } else { 7778 Opc = AArch64::FMLSv2f64; 7779 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7780 FMAInstKind::Accumulator); 7781 } 7782 break; 7783 7784 case AArch64MachineCombinerPattern::FMLSv4f32_OP2: 7785 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2: 7786 RC = &AArch64::FPR128RegClass; 7787 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2) { 7788 Opc = AArch64::FMLSv4i32_indexed; 7789 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7790 FMAInstKind::Indexed); 7791 } else { 7792 Opc = AArch64::FMLSv4f32; 7793 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7794 FMAInstKind::Accumulator); 7795 } 7796 break; 7797 case AArch64MachineCombinerPattern::FMLSv2f32_OP1: 7798 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1: { 7799 RC = &AArch64::FPR64RegClass; 7800 Register NewVR = MRI.createVirtualRegister(RC); 7801 MachineInstrBuilder MIB1 = 7802 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR) 7803 .add(Root.getOperand(2)); 7804 InsInstrs.push_back(MIB1); 7805 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7806 if (Pattern == 
AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1) { 7807 Opc = AArch64::FMLAv2i32_indexed; 7808 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7809 FMAInstKind::Indexed, &NewVR); 7810 } else { 7811 Opc = AArch64::FMLAv2f32; 7812 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7813 FMAInstKind::Accumulator, &NewVR); 7814 } 7815 break; 7816 } 7817 case AArch64MachineCombinerPattern::FMLSv4f32_OP1: 7818 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1: { 7819 RC = &AArch64::FPR128RegClass; 7820 Register NewVR = MRI.createVirtualRegister(RC); 7821 MachineInstrBuilder MIB1 = 7822 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR) 7823 .add(Root.getOperand(2)); 7824 InsInstrs.push_back(MIB1); 7825 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7826 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1) { 7827 Opc = AArch64::FMLAv4i32_indexed; 7828 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7829 FMAInstKind::Indexed, &NewVR); 7830 } else { 7831 Opc = AArch64::FMLAv4f32; 7832 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7833 FMAInstKind::Accumulator, &NewVR); 7834 } 7835 break; 7836 } 7837 case AArch64MachineCombinerPattern::FMLSv2f64_OP1: 7838 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1: { 7839 RC = &AArch64::FPR128RegClass; 7840 Register NewVR = MRI.createVirtualRegister(RC); 7841 MachineInstrBuilder MIB1 = 7842 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR) 7843 .add(Root.getOperand(2)); 7844 InsInstrs.push_back(MIB1); 7845 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7846 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1) { 7847 Opc = AArch64::FMLAv2i64_indexed; 7848 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7849 FMAInstKind::Indexed, &NewVR); 7850 } else { 7851 Opc = AArch64::FMLAv2f64; 7852 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7853 FMAInstKind::Accumulator, &NewVR); 7854 } 7855 break; 7856 } 7857 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1: 7858 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2: { 7859 unsigned IdxDupOp = 7860 (Pattern == AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1 7861 : 2; 7862 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed, 7863 &AArch64::FPR128RegClass, MRI); 7864 break; 7865 } 7866 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1: 7867 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2: { 7868 unsigned IdxDupOp = 7869 (Pattern == AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1 7870 : 2; 7871 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed, 7872 &AArch64::FPR128RegClass, MRI); 7873 break; 7874 } 7875 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1: 7876 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2: { 7877 unsigned IdxDupOp = 7878 (Pattern == AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1 7879 : 2; 7880 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed, 7881 &AArch64::FPR128_loRegClass, MRI); 7882 break; 7883 } 7884 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1: 7885 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2: { 7886 unsigned IdxDupOp = 7887 (Pattern == AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 
1 7888 : 2; 7889 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed, 7890 &AArch64::FPR128RegClass, MRI); 7891 break; 7892 } 7893 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1: 7894 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2: { 7895 unsigned IdxDupOp = 7896 (Pattern == AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1 7897 : 2; 7898 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed, 7899 &AArch64::FPR128_loRegClass, MRI); 7900 break; 7901 } 7902 case AArch64MachineCombinerPattern::FNMADD: { 7903 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs); 7904 break; 7905 } 7906 7907 } // end switch (Pattern) 7908 // Record MUL and ADD/SUB for deletion 7909 if (MUL) 7910 DelInstrs.push_back(MUL); 7911 DelInstrs.push_back(&Root); 7912 7913 // Set the flags on the inserted instructions to be the merged flags of the 7914 // instructions that we have combined. 7915 uint32_t Flags = Root.getFlags(); 7916 if (MUL) 7917 Flags = Root.mergeFlagsWith(*MUL); 7918 for (auto *MI : InsInstrs) 7919 MI->setFlags(Flags); 7920 } 7921 7922 /// Replace csincr-branch sequence by simple conditional branch 7923 /// 7924 /// Examples: 7925 /// 1. \code 7926 /// csinc w9, wzr, wzr, <condition code> 7927 /// tbnz w9, #0, 0x44 7928 /// \endcode 7929 /// to 7930 /// \code 7931 /// b.<inverted condition code> 7932 /// \endcode 7933 /// 7934 /// 2. \code 7935 /// csinc w9, wzr, wzr, <condition code> 7936 /// tbz w9, #0, 0x44 7937 /// \endcode 7938 /// to 7939 /// \code 7940 /// b.<condition code> 7941 /// \endcode 7942 /// 7943 /// Replace compare and branch sequence by TBZ/TBNZ instruction when the 7944 /// compare's constant operand is power of 2. 7945 /// 7946 /// Examples: 7947 /// \code 7948 /// and w8, w8, #0x400 7949 /// cbnz w8, L1 7950 /// \endcode 7951 /// to 7952 /// \code 7953 /// tbnz w8, #10, L1 7954 /// \endcode 7955 /// 7956 /// \param MI Conditional Branch 7957 /// \return True when the simple conditional branch is generated 7958 /// 7959 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const { 7960 bool IsNegativeBranch = false; 7961 bool IsTestAndBranch = false; 7962 unsigned TargetBBInMI = 0; 7963 switch (MI.getOpcode()) { 7964 default: 7965 llvm_unreachable("Unknown branch instruction?"); 7966 case AArch64::Bcc: 7967 return false; 7968 case AArch64::CBZW: 7969 case AArch64::CBZX: 7970 TargetBBInMI = 1; 7971 break; 7972 case AArch64::CBNZW: 7973 case AArch64::CBNZX: 7974 TargetBBInMI = 1; 7975 IsNegativeBranch = true; 7976 break; 7977 case AArch64::TBZW: 7978 case AArch64::TBZX: 7979 TargetBBInMI = 2; 7980 IsTestAndBranch = true; 7981 break; 7982 case AArch64::TBNZW: 7983 case AArch64::TBNZX: 7984 TargetBBInMI = 2; 7985 IsNegativeBranch = true; 7986 IsTestAndBranch = true; 7987 break; 7988 } 7989 // So we increment a zero register and test for bits other 7990 // than bit 0? Conservatively bail out in case the verifier 7991 // missed this case. 7992 if (IsTestAndBranch && MI.getOperand(1).getImm()) 7993 return false; 7994 7995 // Find Definition. 7996 assert(MI.getParent() && "Incomplete machine instruciton\n"); 7997 MachineBasicBlock *MBB = MI.getParent(); 7998 MachineFunction *MF = MBB->getParent(); 7999 MachineRegisterInfo *MRI = &MF->getRegInfo(); 8000 Register VReg = MI.getOperand(0).getReg(); 8001 if (!VReg.isVirtual()) 8002 return false; 8003 8004 MachineInstr *DefMI = MRI->getVRegDef(VReg); 8005 8006 // Look through COPY instructions to find definition. 
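  // The source of each COPY must have a single definition and the COPY must
  // be its only non-debug use; otherwise we conservatively bail out rather
  // than risk changing the value seen by other users.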
8007 while (DefMI->isCopy()) { 8008 Register CopyVReg = DefMI->getOperand(1).getReg(); 8009 if (!MRI->hasOneNonDBGUse(CopyVReg)) 8010 return false; 8011 if (!MRI->hasOneDef(CopyVReg)) 8012 return false; 8013 DefMI = MRI->getVRegDef(CopyVReg); 8014 } 8015 8016 switch (DefMI->getOpcode()) { 8017 default: 8018 return false; 8019 // Fold AND into a TBZ/TBNZ if constant operand is power of 2. 8020 case AArch64::ANDWri: 8021 case AArch64::ANDXri: { 8022 if (IsTestAndBranch) 8023 return false; 8024 if (DefMI->getParent() != MBB) 8025 return false; 8026 if (!MRI->hasOneNonDBGUse(VReg)) 8027 return false; 8028 8029 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri); 8030 uint64_t Mask = AArch64_AM::decodeLogicalImmediate( 8031 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64); 8032 if (!isPowerOf2_64(Mask)) 8033 return false; 8034 8035 MachineOperand &MO = DefMI->getOperand(1); 8036 Register NewReg = MO.getReg(); 8037 if (!NewReg.isVirtual()) 8038 return false; 8039 8040 assert(!MRI->def_empty(NewReg) && "Register must be defined."); 8041 8042 MachineBasicBlock &RefToMBB = *MBB; 8043 MachineBasicBlock *TBB = MI.getOperand(1).getMBB(); 8044 DebugLoc DL = MI.getDebugLoc(); 8045 unsigned Imm = Log2_64(Mask); 8046 unsigned Opc = (Imm < 32) 8047 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW) 8048 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX); 8049 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc)) 8050 .addReg(NewReg) 8051 .addImm(Imm) 8052 .addMBB(TBB); 8053 // Register lives on to the CBZ now. 8054 MO.setIsKill(false); 8055 8056 // For immediate smaller than 32, we need to use the 32-bit 8057 // variant (W) in all cases. Indeed the 64-bit variant does not 8058 // allow to encode them. 8059 // Therefore, if the input register is 64-bit, we need to take the 8060 // 32-bit sub-part. 8061 if (!Is32Bit && Imm < 32) 8062 NewMI->getOperand(0).setSubReg(AArch64::sub_32); 8063 MI.eraseFromParent(); 8064 return true; 8065 } 8066 // Look for CSINC 8067 case AArch64::CSINCWr: 8068 case AArch64::CSINCXr: { 8069 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR && 8070 DefMI->getOperand(2).getReg() == AArch64::WZR) && 8071 !(DefMI->getOperand(1).getReg() == AArch64::XZR && 8072 DefMI->getOperand(2).getReg() == AArch64::XZR)) 8073 return false; 8074 8075 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, 8076 true) != -1) 8077 return false; 8078 8079 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm(); 8080 // Convert only when the condition code is not modified between 8081 // the CSINC and the branch. The CC may be used by other 8082 // instructions in between. 
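    // Rationale for the fold: CSINC Wd, WZR, WZR, cc yields 0 when cc holds
    // and 1 otherwise, so the tested value is the inverse of cc. TBZ/CBZ on
    // it therefore become B.cc, while TBNZ/CBNZ take the inverted condition
    // (applied via getInvertedCondCode below).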
8083 if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write)) 8084 return false; 8085 MachineBasicBlock &RefToMBB = *MBB; 8086 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB(); 8087 DebugLoc DL = MI.getDebugLoc(); 8088 if (IsNegativeBranch) 8089 CC = AArch64CC::getInvertedCondCode(CC); 8090 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB); 8091 MI.eraseFromParent(); 8092 return true; 8093 } 8094 } 8095 } 8096 8097 std::pair<unsigned, unsigned> 8098 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { 8099 const unsigned Mask = AArch64II::MO_FRAGMENT; 8100 return std::make_pair(TF & Mask, TF & ~Mask); 8101 } 8102 8103 ArrayRef<std::pair<unsigned, const char *>> 8104 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { 8105 using namespace AArch64II; 8106 8107 static const std::pair<unsigned, const char *> TargetFlags[] = { 8108 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"}, 8109 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"}, 8110 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"}, 8111 {MO_HI12, "aarch64-hi12"}}; 8112 return ArrayRef(TargetFlags); 8113 } 8114 8115 ArrayRef<std::pair<unsigned, const char *>> 8116 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { 8117 using namespace AArch64II; 8118 8119 static const std::pair<unsigned, const char *> TargetFlags[] = { 8120 {MO_COFFSTUB, "aarch64-coffstub"}, 8121 {MO_GOT, "aarch64-got"}, 8122 {MO_NC, "aarch64-nc"}, 8123 {MO_S, "aarch64-s"}, 8124 {MO_TLS, "aarch64-tls"}, 8125 {MO_DLLIMPORT, "aarch64-dllimport"}, 8126 {MO_PREL, "aarch64-prel"}, 8127 {MO_TAGGED, "aarch64-tagged"}, 8128 {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"}, 8129 }; 8130 return ArrayRef(TargetFlags); 8131 } 8132 8133 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> 8134 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const { 8135 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] = 8136 {{MOSuppressPair, "aarch64-suppress-pair"}, 8137 {MOStridedAccess, "aarch64-strided-access"}}; 8138 return ArrayRef(TargetFlags); 8139 } 8140 8141 /// Constants defining how certain sequences should be outlined. 8142 /// This encompasses how an outlined function should be called, and what kind of 8143 /// frame should be emitted for that outlined function. 8144 /// 8145 /// \p MachineOutlinerDefault implies that the function should be called with 8146 /// a save and restore of LR to the stack. 8147 /// 8148 /// That is, 8149 /// 8150 /// I1 Save LR OUTLINED_FUNCTION: 8151 /// I2 --> BL OUTLINED_FUNCTION I1 8152 /// I3 Restore LR I2 8153 /// I3 8154 /// RET 8155 /// 8156 /// * Call construction overhead: 3 (save + BL + restore) 8157 /// * Frame construction overhead: 1 (ret) 8158 /// * Requires stack fixups? Yes 8159 /// 8160 /// \p MachineOutlinerTailCall implies that the function is being created from 8161 /// a sequence of instructions ending in a return. 8162 /// 8163 /// That is, 8164 /// 8165 /// I1 OUTLINED_FUNCTION: 8166 /// I2 --> B OUTLINED_FUNCTION I1 8167 /// RET I2 8168 /// RET 8169 /// 8170 /// * Call construction overhead: 1 (B) 8171 /// * Frame construction overhead: 0 (Return included in sequence) 8172 /// * Requires stack fixups? No 8173 /// 8174 /// \p MachineOutlinerNoLRSave implies that the function should be called using 8175 /// a BL instruction, but doesn't require LR to be saved and restored. This 8176 /// happens when LR is known to be dead. 
8177 /// 8178 /// That is, 8179 /// 8180 /// I1 OUTLINED_FUNCTION: 8181 /// I2 --> BL OUTLINED_FUNCTION I1 8182 /// I3 I2 8183 /// I3 8184 /// RET 8185 /// 8186 /// * Call construction overhead: 1 (BL) 8187 /// * Frame construction overhead: 1 (RET) 8188 /// * Requires stack fixups? No 8189 /// 8190 /// \p MachineOutlinerThunk implies that the function is being created from 8191 /// a sequence of instructions ending in a call. The outlined function is 8192 /// called with a BL instruction, and the outlined function tail-calls the 8193 /// original call destination. 8194 /// 8195 /// That is, 8196 /// 8197 /// I1 OUTLINED_FUNCTION: 8198 /// I2 --> BL OUTLINED_FUNCTION I1 8199 /// BL f I2 8200 /// B f 8201 /// * Call construction overhead: 1 (BL) 8202 /// * Frame construction overhead: 0 8203 /// * Requires stack fixups? No 8204 /// 8205 /// \p MachineOutlinerRegSave implies that the function should be called with a 8206 /// save and restore of LR to an available register. This allows us to avoid 8207 /// stack fixups. Note that this outlining variant is compatible with the 8208 /// NoLRSave case. 8209 /// 8210 /// That is, 8211 /// 8212 /// I1 Save LR OUTLINED_FUNCTION: 8213 /// I2 --> BL OUTLINED_FUNCTION I1 8214 /// I3 Restore LR I2 8215 /// I3 8216 /// RET 8217 /// 8218 /// * Call construction overhead: 3 (save + BL + restore) 8219 /// * Frame construction overhead: 1 (ret) 8220 /// * Requires stack fixups? No 8221 enum MachineOutlinerClass { 8222 MachineOutlinerDefault, /// Emit a save, restore, call, and return. 8223 MachineOutlinerTailCall, /// Only emit a branch. 8224 MachineOutlinerNoLRSave, /// Emit a call and return. 8225 MachineOutlinerThunk, /// Emit a call and tail-call. 8226 MachineOutlinerRegSave /// Same as default, but save to a register. 8227 }; 8228 8229 enum MachineOutlinerMBBFlags { 8230 LRUnavailableSomewhere = 0x2, 8231 HasCalls = 0x4, 8232 UnsafeRegsDead = 0x8 8233 }; 8234 8235 Register 8236 AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const { 8237 MachineFunction *MF = C.getMF(); 8238 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo(); 8239 const AArch64RegisterInfo *ARI = 8240 static_cast<const AArch64RegisterInfo *>(&TRI); 8241 // Check if there is an available register across the sequence that we can 8242 // use. 8243 for (unsigned Reg : AArch64::GPR64RegClass) { 8244 if (!ARI->isReservedReg(*MF, Reg) && 8245 Reg != AArch64::LR && // LR is not reserved, but don't use it. 8246 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved. 8247 Reg != AArch64::X17 && // Ditto for X17. 
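        // The register must also be free across the candidate sequence and
        // unused inside it, so it can hold the saved LR until the restore.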
        C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
        C.isAvailableInsideSeq(Reg, TRI))
      return Reg;
  }
  return Register();
}

static bool
outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
                                         const outliner::Candidate &b) {
  const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
  const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();

  return MFIa->shouldSignReturnAddress(false) ==
             MFIb->shouldSignReturnAddress(false) &&
         MFIa->shouldSignReturnAddress(true) ==
             MFIb->shouldSignReturnAddress(true);
}

static bool
outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
                                       const outliner::Candidate &b) {
  const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
  const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();

  return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
}

static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
                                                const outliner::Candidate &b) {
  const AArch64Subtarget &SubtargetA =
      a.getMF()->getSubtarget<AArch64Subtarget>();
  const AArch64Subtarget &SubtargetB =
      b.getMF()->getSubtarget<AArch64Subtarget>();
  return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
}

std::optional<outliner::OutlinedFunction>
AArch64InstrInfo::getOutliningCandidateInfo(
    std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
  unsigned SequenceSize = 0;
  for (auto &MI : RepeatedSequenceLocs[0])
    SequenceSize += getInstSizeInBytes(MI);

  unsigned NumBytesToCreateFrame = 0;

  // We only allow outlining for functions having exactly matching return
  // address signing attributes, i.e., all share the same value for the
  // attribute "sign-return-address" and all share the same type of key they
  // are signed with.
  // Additionally, we require all functions to simultaneously either support
  // v8.3a features or not. Otherwise, an outlined function could get signed
  // using dedicated v8.3 instructions, and a call from a function that doesn't
  // support v8.3 instructions would therefore be invalid.
  if (std::adjacent_find(
          RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
          [](const outliner::Candidate &a, const outliner::Candidate &b) {
            // Return true if a and b are non-equal w.r.t. return address
            // signing or support of v8.3a features.
            if (outliningCandidatesSigningScopeConsensus(a, b) &&
                outliningCandidatesSigningKeyConsensus(a, b) &&
                outliningCandidatesV8_3OpsConsensus(a, b)) {
              return false;
            }
            return true;
          }) != RepeatedSequenceLocs.end()) {
    return std::nullopt;
  }

  // Since at this point all candidates agree on their return address signing,
  // picking just one is fine. If the candidate functions potentially sign
  // their return addresses, the outlined function should do the same. Note
  // that in the case of "sign-return-address"="non-leaf" this is an
  // assumption: it is not certain that the outlined function will have to
  // sign its return address, but that decision is made later, after the
  // decision to outline has already been made.
  // The same holds for the number of additional instructions we need: on
  // v8.3a, RET can be replaced by RETAA/RETAB and no AUT instruction is
  // necessary.
However, at this point we don't know if the outlined function 8325 // will have a RET instruction so we assume the worst. 8326 const TargetRegisterInfo &TRI = getRegisterInfo(); 8327 // Performing a tail call may require extra checks when PAuth is enabled. 8328 // If PAuth is disabled, set it to zero for uniformity. 8329 unsigned NumBytesToCheckLRInTCEpilogue = 0; 8330 if (RepeatedSequenceLocs[0] 8331 .getMF() 8332 ->getInfo<AArch64FunctionInfo>() 8333 ->shouldSignReturnAddress(true)) { 8334 // One PAC and one AUT instructions 8335 NumBytesToCreateFrame += 8; 8336 8337 // PAuth is enabled - set extra tail call cost, if any. 8338 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod( 8339 *RepeatedSequenceLocs[0].getMF()); 8340 NumBytesToCheckLRInTCEpilogue = 8341 AArch64PAuth::getCheckerSizeInBytes(LRCheckMethod); 8342 // Checking the authenticated LR value may significantly impact 8343 // SequenceSize, so account for it for more precise results. 8344 if (isTailCallReturnInst(RepeatedSequenceLocs[0].back())) 8345 SequenceSize += NumBytesToCheckLRInTCEpilogue; 8346 8347 // We have to check if sp modifying instructions would get outlined. 8348 // If so we only allow outlining if sp is unchanged overall, so matching 8349 // sub and add instructions are okay to outline, all other sp modifications 8350 // are not 8351 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) { 8352 int SPValue = 0; 8353 for (auto &MI : C) { 8354 if (MI.modifiesRegister(AArch64::SP, &TRI)) { 8355 switch (MI.getOpcode()) { 8356 case AArch64::ADDXri: 8357 case AArch64::ADDWri: 8358 assert(MI.getNumOperands() == 4 && "Wrong number of operands"); 8359 assert(MI.getOperand(2).isImm() && 8360 "Expected operand to be immediate"); 8361 assert(MI.getOperand(1).isReg() && 8362 "Expected operand to be a register"); 8363 // Check if the add just increments sp. If so, we search for 8364 // matching sub instructions that decrement sp. If not, the 8365 // modification is illegal 8366 if (MI.getOperand(1).getReg() == AArch64::SP) 8367 SPValue += MI.getOperand(2).getImm(); 8368 else 8369 return true; 8370 break; 8371 case AArch64::SUBXri: 8372 case AArch64::SUBWri: 8373 assert(MI.getNumOperands() == 4 && "Wrong number of operands"); 8374 assert(MI.getOperand(2).isImm() && 8375 "Expected operand to be immediate"); 8376 assert(MI.getOperand(1).isReg() && 8377 "Expected operand to be a register"); 8378 // Check if the sub just decrements sp. If so, we search for 8379 // matching add instructions that increment sp. If not, the 8380 // modification is illegal 8381 if (MI.getOperand(1).getReg() == AArch64::SP) 8382 SPValue -= MI.getOperand(2).getImm(); 8383 else 8384 return true; 8385 break; 8386 default: 8387 return true; 8388 } 8389 } 8390 } 8391 if (SPValue) 8392 return true; 8393 return false; 8394 }; 8395 // Remove candidates with illegal stack modifying instructions 8396 llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification); 8397 8398 // If the sequence doesn't have enough candidates left, then we're done. 8399 if (RepeatedSequenceLocs.size() < 2) 8400 return std::nullopt; 8401 } 8402 8403 // Properties about candidate MBBs that hold for all of them. 8404 unsigned FlagsSetInAll = 0xF; 8405 8406 // Compute liveness information for each candidate, and set FlagsSetInAll. 8407 for (outliner::Candidate &C : RepeatedSequenceLocs) 8408 FlagsSetInAll &= C.Flags; 8409 8410 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode(); 8411 8412 // Helper lambda which sets call information for every candidate. 
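  // (CallID selects the MachineOutlinerClass recorded for each call site;
  //  NumBytesForCall is the per-call overhead in bytes used by the outliner's
  //  cost model.)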
8413 auto SetCandidateCallInfo = 8414 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) { 8415 for (outliner::Candidate &C : RepeatedSequenceLocs) 8416 C.setCallInfo(CallID, NumBytesForCall); 8417 }; 8418 8419 unsigned FrameID = MachineOutlinerDefault; 8420 NumBytesToCreateFrame += 4; 8421 8422 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) { 8423 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement(); 8424 }); 8425 8426 // We check to see if CFI Instructions are present, and if they are 8427 // we find the number of CFI Instructions in the candidates. 8428 unsigned CFICount = 0; 8429 for (auto &I : RepeatedSequenceLocs[0]) { 8430 if (I.isCFIInstruction()) 8431 CFICount++; 8432 } 8433 8434 // We compare the number of found CFI Instructions to the number of CFI 8435 // instructions in the parent function for each candidate. We must check this 8436 // since if we outline one of the CFI instructions in a function, we have to 8437 // outline them all for correctness. If we do not, the address offsets will be 8438 // incorrect between the two sections of the program. 8439 for (outliner::Candidate &C : RepeatedSequenceLocs) { 8440 std::vector<MCCFIInstruction> CFIInstructions = 8441 C.getMF()->getFrameInstructions(); 8442 8443 if (CFICount > 0 && CFICount != CFIInstructions.size()) 8444 return std::nullopt; 8445 } 8446 8447 // Returns true if an instructions is safe to fix up, false otherwise. 8448 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) { 8449 if (MI.isCall()) 8450 return true; 8451 8452 if (!MI.modifiesRegister(AArch64::SP, &TRI) && 8453 !MI.readsRegister(AArch64::SP, &TRI)) 8454 return true; 8455 8456 // Any modification of SP will break our code to save/restore LR. 8457 // FIXME: We could handle some instructions which add a constant 8458 // offset to SP, with a bit more work. 8459 if (MI.modifiesRegister(AArch64::SP, &TRI)) 8460 return false; 8461 8462 // At this point, we have a stack instruction that we might need to 8463 // fix up. We'll handle it if it's a load or store. 8464 if (MI.mayLoadOrStore()) { 8465 const MachineOperand *Base; // Filled with the base operand of MI. 8466 int64_t Offset; // Filled with the offset of MI. 8467 bool OffsetIsScalable; 8468 8469 // Does it allow us to offset the base operand and is the base the 8470 // register SP? 8471 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) || 8472 !Base->isReg() || Base->getReg() != AArch64::SP) 8473 return false; 8474 8475 // Fixe-up code below assumes bytes. 8476 if (OffsetIsScalable) 8477 return false; 8478 8479 // Find the minimum/maximum offset for this instruction and check 8480 // if fixing it up would be in range. 8481 int64_t MinOffset, 8482 MaxOffset; // Unscaled offsets for the instruction. 8483 // The scale to multiply the offsets by. 8484 TypeSize Scale(0U, false), DummyWidth(0U, false); 8485 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset); 8486 8487 Offset += 16; // Update the offset to what it would be if we outlined. 8488 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() || 8489 Offset > MaxOffset * (int64_t)Scale.getFixedValue()) 8490 return false; 8491 8492 // It's in range, so we can outline it. 8493 return true; 8494 } 8495 8496 // FIXME: Add handling for instructions like "add x0, sp, #8". 8497 8498 // We can't fix it up, so don't outline it. 8499 return false; 8500 }; 8501 8502 // True if it's possible to fix up each stack instruction in this sequence. 
8503 // Important for frames/call variants that modify the stack. 8504 bool AllStackInstrsSafe = 8505 llvm::all_of(RepeatedSequenceLocs[0], IsSafeToFixup); 8506 8507 // If the last instruction in any candidate is a terminator, then we should 8508 // tail call all of the candidates. 8509 if (RepeatedSequenceLocs[0].back().isTerminator()) { 8510 FrameID = MachineOutlinerTailCall; 8511 NumBytesToCreateFrame = 0; 8512 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue; 8513 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall); 8514 } 8515 8516 else if (LastInstrOpcode == AArch64::BL || 8517 ((LastInstrOpcode == AArch64::BLR || 8518 LastInstrOpcode == AArch64::BLRNoIP) && 8519 !HasBTI)) { 8520 // FIXME: Do we need to check if the code after this uses the value of LR? 8521 FrameID = MachineOutlinerThunk; 8522 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue; 8523 SetCandidateCallInfo(MachineOutlinerThunk, 4); 8524 } 8525 8526 else { 8527 // We need to decide how to emit calls + frames. We can always emit the same 8528 // frame if we don't need to save to the stack. If we have to save to the 8529 // stack, then we need a different frame. 8530 unsigned NumBytesNoStackCalls = 0; 8531 std::vector<outliner::Candidate> CandidatesWithoutStackFixups; 8532 8533 // Check if we have to save LR. 8534 for (outliner::Candidate &C : RepeatedSequenceLocs) { 8535 bool LRAvailable = 8536 (C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere) 8537 ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) 8538 : true; 8539 // If we have a noreturn caller, then we're going to be conservative and 8540 // say that we have to save LR. If we don't have a ret at the end of the 8541 // block, then we can't reason about liveness accurately. 8542 // 8543 // FIXME: We can probably do better than always disabling this in 8544 // noreturn functions by fixing up the liveness info. 8545 bool IsNoReturn = 8546 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn); 8547 8548 // Is LR available? If so, we don't need a save. 8549 if (LRAvailable && !IsNoReturn) { 8550 NumBytesNoStackCalls += 4; 8551 C.setCallInfo(MachineOutlinerNoLRSave, 4); 8552 CandidatesWithoutStackFixups.push_back(C); 8553 } 8554 8555 // Is an unused register available? If so, we won't modify the stack, so 8556 // we can outline with the same frame type as those that don't save LR. 8557 else if (findRegisterToSaveLRTo(C)) { 8558 NumBytesNoStackCalls += 12; 8559 C.setCallInfo(MachineOutlinerRegSave, 12); 8560 CandidatesWithoutStackFixups.push_back(C); 8561 } 8562 8563 // Is SP used in the sequence at all? If not, we don't have to modify 8564 // the stack, so we are guaranteed to get the same frame. 8565 else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) { 8566 NumBytesNoStackCalls += 12; 8567 C.setCallInfo(MachineOutlinerDefault, 12); 8568 CandidatesWithoutStackFixups.push_back(C); 8569 } 8570 8571 // If we outline this, we need to modify the stack. Pretend we don't 8572 // outline this by saving all of its bytes. 8573 else { 8574 NumBytesNoStackCalls += SequenceSize; 8575 } 8576 } 8577 8578 // If there are no places where we have to save LR, then note that we 8579 // don't have to update the stack. Otherwise, give every candidate the 8580 // default call type, as long as it's safe to do so. 
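    // Note: 12 bytes is the call overhead of MachineOutlinerDefault (save LR,
    // BL, restore LR), so the no-fixup variants are preferred whenever they
    // are collectively no more expensive than defaulting every candidate.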
    if (!AllStackInstrsSafe ||
        NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
      RepeatedSequenceLocs = CandidatesWithoutStackFixups;
      FrameID = MachineOutlinerNoLRSave;
      if (RepeatedSequenceLocs.size() < 2)
        return std::nullopt;
    } else {
      SetCandidateCallInfo(MachineOutlinerDefault, 12);

      // Bugzilla ID: 46767
      // TODO: Check if fixing up the stack more than once is safe so we can
      // outline these.
      //
      // An outline resulting in a caller that requires stack fixups at the
      // call site to a callee that also requires stack fixups can happen when
      // there are no available registers at the candidate call site for a
      // candidate that itself also has calls.
      //
      // In other words, if function_containing_sequence in the following
      // pseudo-assembly requires that we save LR at the point of the call,
      // but there are no available registers, we save using SP; as a result,
      // the SP offsets require stack fixups in multiples of 16.
      //
      // function_containing_sequence:
      // ...
      // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
      // call OUTLINED_FUNCTION_N
      // restore LR from SP
      // ...
      //
      // OUTLINED_FUNCTION_N:
      // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
      // ...
      // bl foo
      // restore LR from SP
      // ret
      //
      // Because the code to handle more than one stack fixup does not
      // currently have the proper checks for legality, these cases will
      // assert in the AArch64 MachineOutliner. This is because the code to do
      // this needs more hardening, testing, and better checks that the
      // generated code is legal, etc., and because it is only verified to
      // handle a single pass of stack fixup.
      //
      // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
      // these cases until they are known to be handled. Bugzilla 46767 is
      // referenced in comments at the assert site.
      //
      // To avoid asserting (or generating illegal code in no-assert builds)
      // we remove all candidates which would need more than one stack fixup
      // by pruning the cases where the candidate has calls while also having
      // no available LR and no available general purpose registers to copy LR
      // to (i.e. one extra stack save/restore).
      //
      if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
        erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
          auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
          return (llvm::any_of(C, IsCall)) &&
                 (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
                  !findRegisterToSaveLRTo(C));
        });
      }
    }

    // If we dropped all of the candidates, bail out here.
    if (RepeatedSequenceLocs.size() < 2) {
      RepeatedSequenceLocs.clear();
      return std::nullopt;
    }
  }

  // Does every candidate's MBB contain a call? If so, then we might have a
  // call in the range.
  if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
    // Check if the range contains a call. These require a save + restore of
    // the link register.
8657 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0]; 8658 bool ModStackToSaveLR = false; 8659 if (std::any_of(FirstCand.begin(), std::prev(FirstCand.end()), 8660 [](const MachineInstr &MI) { return MI.isCall(); })) 8661 ModStackToSaveLR = true; 8662 8663 // Handle the last instruction separately. If this is a tail call, then the 8664 // last instruction is a call. We don't want to save + restore in this case. 8665 // However, it could be possible that the last instruction is a call without 8666 // it being valid to tail call this sequence. We should consider this as 8667 // well. 8668 else if (FrameID != MachineOutlinerThunk && 8669 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall()) 8670 ModStackToSaveLR = true; 8671 8672 if (ModStackToSaveLR) { 8673 // We can't fix up the stack. Bail out. 8674 if (!AllStackInstrsSafe) { 8675 RepeatedSequenceLocs.clear(); 8676 return std::nullopt; 8677 } 8678 8679 // Save + restore LR. 8680 NumBytesToCreateFrame += 8; 8681 } 8682 } 8683 8684 // If we have CFI instructions, we can only outline if the outlined section 8685 // can be a tail call 8686 if (FrameID != MachineOutlinerTailCall && CFICount > 0) 8687 return std::nullopt; 8688 8689 return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, 8690 NumBytesToCreateFrame, FrameID); 8691 } 8692 8693 void AArch64InstrInfo::mergeOutliningCandidateAttributes( 8694 Function &F, std::vector<outliner::Candidate> &Candidates) const { 8695 // If a bunch of candidates reach this point they must agree on their return 8696 // address signing. It is therefore enough to just consider the signing 8697 // behaviour of one of them 8698 const auto &CFn = Candidates.front().getMF()->getFunction(); 8699 8700 if (CFn.hasFnAttribute("ptrauth-returns")) 8701 F.addFnAttr(CFn.getFnAttribute("ptrauth-returns")); 8702 if (CFn.hasFnAttribute("ptrauth-auth-traps")) 8703 F.addFnAttr(CFn.getFnAttribute("ptrauth-auth-traps")); 8704 // Since all candidates belong to the same module, just copy the 8705 // function-level attributes of an arbitrary function. 8706 if (CFn.hasFnAttribute("sign-return-address")) 8707 F.addFnAttr(CFn.getFnAttribute("sign-return-address")); 8708 if (CFn.hasFnAttribute("sign-return-address-key")) 8709 F.addFnAttr(CFn.getFnAttribute("sign-return-address-key")); 8710 8711 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates); 8712 } 8713 8714 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom( 8715 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const { 8716 const Function &F = MF.getFunction(); 8717 8718 // Can F be deduplicated by the linker? If it can, don't outline from it. 8719 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage()) 8720 return false; 8721 8722 // Don't outline from functions with section markings; the program could 8723 // expect that all the code is in the named section. 8724 // FIXME: Allow outlining from multiple functions with the same section 8725 // marking. 8726 if (F.hasSection()) 8727 return false; 8728 8729 // Outlining from functions with redzones is unsafe since the outliner may 8730 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't 8731 // outline from it. 8732 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); 8733 if (!AFI || AFI->hasRedZone().value_or(true)) 8734 return false; 8735 8736 // FIXME: Determine whether it is safe to outline from functions which contain 8737 // streaming-mode changes. 
We may need to ensure any smstart/smstop pairs are 8738 // outlined together and ensure it is safe to outline with async unwind info, 8739 // required for saving & restoring VG around calls. 8740 if (AFI->hasStreamingModeChanges()) 8741 return false; 8742 8743 // FIXME: Teach the outliner to generate/handle Windows unwind info. 8744 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) 8745 return false; 8746 8747 // It's safe to outline from MF. 8748 return true; 8749 } 8750 8751 SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>> 8752 AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB, 8753 unsigned &Flags) const { 8754 assert(MBB.getParent()->getRegInfo().tracksLiveness() && 8755 "Must track liveness!"); 8756 SmallVector< 8757 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>> 8758 Ranges; 8759 // According to the AArch64 Procedure Call Standard, the following are 8760 // undefined on entry/exit from a function call: 8761 // 8762 // * Registers x16, x17, (and thus w16, w17) 8763 // * Condition codes (and thus the NZCV register) 8764 // 8765 // If any of these registers are used inside or live across an outlined 8766 // function, then they may be modified later, either by the compiler or 8767 // some other tool (like the linker). 8768 // 8769 // To avoid outlining in these situations, partition each block into ranges 8770 // where these registers are dead. We will only outline from those ranges. 8771 LiveRegUnits LRU(getRegisterInfo()); 8772 auto AreAllUnsafeRegsDead = [&LRU]() { 8773 return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) && 8774 LRU.available(AArch64::NZCV); 8775 }; 8776 8777 // We need to know if LR is live across an outlining boundary later on in 8778 // order to decide how we'll create the outlined call, frame, etc. 8779 // 8780 // It's pretty expensive to check this for *every candidate* within a block. 8781 // That's some potentially n^2 behaviour, since in the worst case, we'd need 8782 // to compute liveness from the end of the block for O(n) candidates within 8783 // the block. 8784 // 8785 // So, to improve the average case, let's keep track of liveness from the end 8786 // of the block to the beginning of *every outlinable range*. If we know that 8787 // LR is available in every range we could outline from, then we know that 8788 // we don't need to check liveness for any candidate within that range. 8789 bool LRAvailableEverywhere = true; 8790 // Compute liveness bottom-up. 8791 LRU.addLiveOuts(MBB); 8792 // Update flags that require info about the entire MBB. 8793 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) { 8794 if (MI.isCall() && !MI.isTerminator()) 8795 Flags |= MachineOutlinerMBBFlags::HasCalls; 8796 }; 8797 // Range: [RangeBegin, RangeEnd) 8798 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd; 8799 unsigned RangeLen; 8800 auto CreateNewRangeStartingAt = 8801 [&RangeBegin, &RangeEnd, 8802 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) { 8803 RangeBegin = NewBegin; 8804 RangeEnd = std::next(RangeBegin); 8805 RangeLen = 0; 8806 }; 8807 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() { 8808 // At least one unsafe register is not dead. We do not want to outline at 8809 // this point. If it is long enough to outline from, save the range 8810 // [RangeBegin, RangeEnd). 8811 if (RangeLen > 1) 8812 Ranges.push_back(std::make_pair(RangeBegin, RangeEnd)); 8813 }; 8814 // Find the first point where all unsafe registers are dead. 
8815 // FIND: <safe instr> <-- end of first potential range 8816 // SKIP: <unsafe def> 8817 // SKIP: ... everything between ... 8818 // SKIP: <unsafe use> 8819 auto FirstPossibleEndPt = MBB.instr_rbegin(); 8820 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) { 8821 LRU.stepBackward(*FirstPossibleEndPt); 8822 // Update flags that impact how we outline across the entire block, 8823 // regardless of safety. 8824 UpdateWholeMBBFlags(*FirstPossibleEndPt); 8825 if (AreAllUnsafeRegsDead()) 8826 break; 8827 } 8828 // If we exhausted the entire block, we have no safe ranges to outline. 8829 if (FirstPossibleEndPt == MBB.instr_rend()) 8830 return Ranges; 8831 // Current range. 8832 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator()); 8833 // StartPt points to the first place where all unsafe registers 8834 // are dead (if there is any such point). Begin partitioning the MBB into 8835 // ranges. 8836 for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) { 8837 LRU.stepBackward(MI); 8838 UpdateWholeMBBFlags(MI); 8839 if (!AreAllUnsafeRegsDead()) { 8840 SaveRangeIfNonEmpty(); 8841 CreateNewRangeStartingAt(MI.getIterator()); 8842 continue; 8843 } 8844 LRAvailableEverywhere &= LRU.available(AArch64::LR); 8845 RangeBegin = MI.getIterator(); 8846 ++RangeLen; 8847 } 8848 // Above loop misses the last (or only) range. If we are still safe, then 8849 // let's save the range. 8850 if (AreAllUnsafeRegsDead()) 8851 SaveRangeIfNonEmpty(); 8852 if (Ranges.empty()) 8853 return Ranges; 8854 // We found the ranges bottom-up. Mapping expects the top-down. Reverse 8855 // the order. 8856 std::reverse(Ranges.begin(), Ranges.end()); 8857 // If there is at least one outlinable range where LR is unavailable 8858 // somewhere, remember that. 8859 if (!LRAvailableEverywhere) 8860 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere; 8861 return Ranges; 8862 } 8863 8864 outliner::InstrType 8865 AArch64InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT, 8866 unsigned Flags) const { 8867 MachineInstr &MI = *MIT; 8868 MachineBasicBlock *MBB = MI.getParent(); 8869 MachineFunction *MF = MBB->getParent(); 8870 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); 8871 8872 // Don't outline anything used for return address signing. The outlined 8873 // function will get signed later if needed 8874 switch (MI.getOpcode()) { 8875 case AArch64::PACM: 8876 case AArch64::PACIASP: 8877 case AArch64::PACIBSP: 8878 case AArch64::PACIASPPC: 8879 case AArch64::PACIBSPPC: 8880 case AArch64::AUTIASP: 8881 case AArch64::AUTIBSP: 8882 case AArch64::AUTIASPPCi: 8883 case AArch64::AUTIASPPCr: 8884 case AArch64::AUTIBSPPCi: 8885 case AArch64::AUTIBSPPCr: 8886 case AArch64::RETAA: 8887 case AArch64::RETAB: 8888 case AArch64::RETAASPPCi: 8889 case AArch64::RETAASPPCr: 8890 case AArch64::RETABSPPCi: 8891 case AArch64::RETABSPPCr: 8892 case AArch64::EMITBKEY: 8893 case AArch64::PAUTH_PROLOGUE: 8894 case AArch64::PAUTH_EPILOGUE: 8895 return outliner::InstrType::Illegal; 8896 } 8897 8898 // Don't outline LOHs. 8899 if (FuncInfo->getLOHRelated().count(&MI)) 8900 return outliner::InstrType::Illegal; 8901 8902 // We can only outline these if we will tail call the outlined function, or 8903 // fix up the CFI offsets. Currently, CFI instructions are outlined only if 8904 // in a tail call. 8905 // 8906 // FIXME: If the proper fixups for the offset are implemented, this should be 8907 // possible. 
  if (MI.isCFIInstruction())
    return outliner::InstrType::Legal;

  // Is this a terminator for a basic block?
  if (MI.isTerminator())
    // TargetInstrInfo::getOutliningType has already filtered out anything
    // that would break this, so we can allow it here.
    return outliner::InstrType::Legal;

  // Make sure none of the operands are un-outlinable.
  for (const MachineOperand &MOP : MI.operands()) {
    // A check preventing CFI indices was here before, but only CFI
    // instructions should have those.
    assert(!MOP.isCFIIndex());

    // If it uses LR or W30 explicitly, then don't touch it.
    if (MOP.isReg() && !MOP.isImplicit() &&
        (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
      return outliner::InstrType::Illegal;
  }

  // Special cases for instructions that can always be outlined, but will fail
  // the later tests, e.g. ADRPs, which are PC-relative, use LR, but can always
  // be outlined because they don't require a *specific* value to be in LR.
  if (MI.getOpcode() == AArch64::ADRP)
    return outliner::InstrType::Legal;

  // If MI is a call we might be able to outline it. We don't want to outline
  // any calls that rely on the position of items on the stack. When we outline
  // something containing a call, we have to emit a save and restore of LR in
  // the outlined function. Currently, this always happens by saving LR to the
  // stack. Thus, if we outline, say, half the parameters for a function call
  // plus the call, then we'll break the callee's expectations for the layout
  // of the stack.
  //
  // FIXME: Allow calls to functions which construct a stack frame, as long
  // as they don't access arguments on the stack.
  // FIXME: Figure out some way to analyze functions defined in other modules.
  // We should be able to compute the memory usage based on the IR calling
  // convention, even if we can't see the definition.
  if (MI.isCall()) {
    // Get the function associated with the call. Look at each operand and find
    // the one that represents the callee and get its name.
    const Function *Callee = nullptr;
    for (const MachineOperand &MOP : MI.operands()) {
      if (MOP.isGlobal()) {
        Callee = dyn_cast<Function>(MOP.getGlobal());
        break;
      }
    }

    // Never outline calls to mcount. There isn't any rule that would require
    // this, but the Linux kernel's "ftrace" feature depends on it.
    if (Callee && Callee->getName() == "\01_mcount")
      return outliner::InstrType::Illegal;

    // If we don't know anything about the callee, assume it depends on the
    // stack layout of the caller. In that case, it's only legal to outline it
    // as a tail call. Explicitly list the call instructions we know about so
    // we don't get unexpected results with call pseudo-instructions.
    auto UnknownCallOutlineType = outliner::InstrType::Illegal;
    if (MI.getOpcode() == AArch64::BLR ||
        MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
      UnknownCallOutlineType = outliner::InstrType::LegalTerminator;

    if (!Callee)
      return UnknownCallOutlineType;

    // We have a function that we have information about. Check whether it's
    // something we can safely outline.
    MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);

    // We don't know what's going on with the callee at all. Don't touch it.
8981 if (!CalleeMF) 8982 return UnknownCallOutlineType; 8983 8984 // Check if we know anything about the callee saves on the function. If we 8985 // don't, then don't touch it, since that implies that we haven't 8986 // computed anything about its stack frame yet. 8987 MachineFrameInfo &MFI = CalleeMF->getFrameInfo(); 8988 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 || 8989 MFI.getNumObjects() > 0) 8990 return UnknownCallOutlineType; 8991 8992 // At this point, we can say that CalleeMF ought to not pass anything on the 8993 // stack. Therefore, we can outline it. 8994 return outliner::InstrType::Legal; 8995 } 8996 8997 // Don't touch the link register or W30. 8998 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) || 8999 MI.modifiesRegister(AArch64::W30, &getRegisterInfo())) 9000 return outliner::InstrType::Illegal; 9001 9002 // Don't outline BTI instructions, because that will prevent the outlining 9003 // site from being indirectly callable. 9004 if (hasBTISemantics(MI)) 9005 return outliner::InstrType::Illegal; 9006 9007 return outliner::InstrType::Legal; 9008 } 9009 9010 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const { 9011 for (MachineInstr &MI : MBB) { 9012 const MachineOperand *Base; 9013 TypeSize Width(0, false); 9014 int64_t Offset; 9015 bool OffsetIsScalable; 9016 9017 // Is this a load or store with an immediate offset with SP as the base? 9018 if (!MI.mayLoadOrStore() || 9019 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width, 9020 &RI) || 9021 (Base->isReg() && Base->getReg() != AArch64::SP)) 9022 continue; 9023 9024 // It is, so we have to fix it up. 9025 TypeSize Scale(0U, false); 9026 int64_t Dummy1, Dummy2; 9027 9028 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI); 9029 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!"); 9030 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2); 9031 assert(Scale != 0 && "Unexpected opcode!"); 9032 assert(!OffsetIsScalable && "Expected offset to be a byte offset"); 9033 9034 // We've pushed the return address to the stack, so add 16 to the offset. 9035 // This is safe, since we already checked if it would overflow when we 9036 // checked if this instruction was legal to outline. 9037 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue(); 9038 StackOffsetOperand.setImm(NewImm); 9039 } 9040 } 9041 9042 static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, 9043 const AArch64InstrInfo *TII, 9044 bool ShouldSignReturnAddr) { 9045 if (!ShouldSignReturnAddr) 9046 return; 9047 9048 BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE)) 9049 .setMIFlag(MachineInstr::FrameSetup); 9050 BuildMI(MBB, MBB.getFirstInstrTerminator(), DebugLoc(), 9051 TII->get(AArch64::PAUTH_EPILOGUE)) 9052 .setMIFlag(MachineInstr::FrameDestroy); 9053 } 9054 9055 void AArch64InstrInfo::buildOutlinedFrame( 9056 MachineBasicBlock &MBB, MachineFunction &MF, 9057 const outliner::OutlinedFunction &OF) const { 9058 9059 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>(); 9060 9061 if (OF.FrameConstructionID == MachineOutlinerTailCall) 9062 FI->setOutliningStyle("Tail Call"); 9063 else if (OF.FrameConstructionID == MachineOutlinerThunk) { 9064 // For thunk outlining, rewrite the last instruction from a call to a 9065 // tail-call. 
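    // e.g. a trailing 'BL f' is rewritten to the tail-call pseudo
    // 'TCRETURNdi f'; an indirect 'BLR xN' becomes 'TCRETURNriALL xN'.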
9066 MachineInstr *Call = &*--MBB.instr_end(); 9067 unsigned TailOpcode; 9068 if (Call->getOpcode() == AArch64::BL) { 9069 TailOpcode = AArch64::TCRETURNdi; 9070 } else { 9071 assert(Call->getOpcode() == AArch64::BLR || 9072 Call->getOpcode() == AArch64::BLRNoIP); 9073 TailOpcode = AArch64::TCRETURNriALL; 9074 } 9075 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode)) 9076 .add(Call->getOperand(0)) 9077 .addImm(0); 9078 MBB.insert(MBB.end(), TC); 9079 Call->eraseFromParent(); 9080 9081 FI->setOutliningStyle("Thunk"); 9082 } 9083 9084 bool IsLeafFunction = true; 9085 9086 // Is there a call in the outlined range? 9087 auto IsNonTailCall = [](const MachineInstr &MI) { 9088 return MI.isCall() && !MI.isReturn(); 9089 }; 9090 9091 if (llvm::any_of(MBB.instrs(), IsNonTailCall)) { 9092 // Fix up the instructions in the range, since we're going to modify the 9093 // stack. 9094 9095 // Bugzilla ID: 46767 9096 // TODO: Check if fixing up twice is safe so we can outline these. 9097 assert(OF.FrameConstructionID != MachineOutlinerDefault && 9098 "Can only fix up stack references once"); 9099 fixupPostOutline(MBB); 9100 9101 IsLeafFunction = false; 9102 9103 // LR has to be a live in so that we can save it. 9104 if (!MBB.isLiveIn(AArch64::LR)) 9105 MBB.addLiveIn(AArch64::LR); 9106 9107 MachineBasicBlock::iterator It = MBB.begin(); 9108 MachineBasicBlock::iterator Et = MBB.end(); 9109 9110 if (OF.FrameConstructionID == MachineOutlinerTailCall || 9111 OF.FrameConstructionID == MachineOutlinerThunk) 9112 Et = std::prev(MBB.end()); 9113 9114 // Insert a save before the outlined region 9115 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) 9116 .addReg(AArch64::SP, RegState::Define) 9117 .addReg(AArch64::LR) 9118 .addReg(AArch64::SP) 9119 .addImm(-16); 9120 It = MBB.insert(It, STRXpre); 9121 9122 if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) { 9123 const TargetSubtargetInfo &STI = MF.getSubtarget(); 9124 const MCRegisterInfo *MRI = STI.getRegisterInfo(); 9125 unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true); 9126 9127 // Add a CFI saying the stack was moved 16 B down. 9128 int64_t StackPosEntry = 9129 MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16)); 9130 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) 9131 .addCFIIndex(StackPosEntry) 9132 .setMIFlags(MachineInstr::FrameSetup); 9133 9134 // Add a CFI saying that the LR that we want to find is now 16 B higher 9135 // than before. 9136 int64_t LRPosEntry = MF.addFrameInst( 9137 MCCFIInstruction::createOffset(nullptr, DwarfReg, -16)); 9138 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) 9139 .addCFIIndex(LRPosEntry) 9140 .setMIFlags(MachineInstr::FrameSetup); 9141 } 9142 9143 // Insert a restore before the terminator for the function. 9144 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) 9145 .addReg(AArch64::SP, RegState::Define) 9146 .addReg(AArch64::LR, RegState::Define) 9147 .addReg(AArch64::SP) 9148 .addImm(16); 9149 Et = MBB.insert(Et, LDRXpost); 9150 } 9151 9152 bool ShouldSignReturnAddr = FI->shouldSignReturnAddress(!IsLeafFunction); 9153 9154 // If this is a tail call outlined function, then there's already a return. 9155 if (OF.FrameConstructionID == MachineOutlinerTailCall || 9156 OF.FrameConstructionID == MachineOutlinerThunk) { 9157 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr); 9158 return; 9159 } 9160 9161 // It's not a tail call, so we have to insert the return ourselves. 
9162 9163 // LR has to be a live in so that we can return to it. 9164 if (!MBB.isLiveIn(AArch64::LR)) 9165 MBB.addLiveIn(AArch64::LR); 9166 9167 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET)) 9168 .addReg(AArch64::LR); 9169 MBB.insert(MBB.end(), ret); 9170 9171 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr); 9172 9173 FI->setOutliningStyle("Function"); 9174 9175 // Did we have to modify the stack by saving the link register? 9176 if (OF.FrameConstructionID != MachineOutlinerDefault) 9177 return; 9178 9179 // We modified the stack. 9180 // Walk over the basic block and fix up all the stack accesses. 9181 fixupPostOutline(MBB); 9182 } 9183 9184 MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall( 9185 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, 9186 MachineFunction &MF, outliner::Candidate &C) const { 9187 9188 // Are we tail calling? 9189 if (C.CallConstructionID == MachineOutlinerTailCall) { 9190 // If yes, then we can just branch to the label. 9191 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi)) 9192 .addGlobalAddress(M.getNamedValue(MF.getName())) 9193 .addImm(0)); 9194 return It; 9195 } 9196 9197 // Are we saving the link register? 9198 if (C.CallConstructionID == MachineOutlinerNoLRSave || 9199 C.CallConstructionID == MachineOutlinerThunk) { 9200 // No, so just insert the call. 9201 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL)) 9202 .addGlobalAddress(M.getNamedValue(MF.getName()))); 9203 return It; 9204 } 9205 9206 // We want to return the spot where we inserted the call. 9207 MachineBasicBlock::iterator CallPt; 9208 9209 // Instructions for saving and restoring LR around the call instruction we're 9210 // going to insert. 9211 MachineInstr *Save; 9212 MachineInstr *Restore; 9213 // Can we save to a register? 9214 if (C.CallConstructionID == MachineOutlinerRegSave) { 9215 // FIXME: This logic should be sunk into a target-specific interface so that 9216 // we don't have to recompute the register. 9217 Register Reg = findRegisterToSaveLRTo(C); 9218 assert(Reg && "No callee-saved register available?"); 9219 9220 // LR has to be a live in so that we can save it. 9221 if (!MBB.isLiveIn(AArch64::LR)) 9222 MBB.addLiveIn(AArch64::LR); 9223 9224 // Save and restore LR from Reg. 9225 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg) 9226 .addReg(AArch64::XZR) 9227 .addReg(AArch64::LR) 9228 .addImm(0); 9229 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR) 9230 .addReg(AArch64::XZR) 9231 .addReg(Reg) 9232 .addImm(0); 9233 } else { 9234 // We have the default case. Save and restore from SP. 9235 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) 9236 .addReg(AArch64::SP, RegState::Define) 9237 .addReg(AArch64::LR) 9238 .addReg(AArch64::SP) 9239 .addImm(-16); 9240 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) 9241 .addReg(AArch64::SP, RegState::Define) 9242 .addReg(AArch64::LR, RegState::Define) 9243 .addReg(AArch64::SP) 9244 .addImm(16); 9245 } 9246 9247 It = MBB.insert(It, Save); 9248 It++; 9249 9250 // Insert the call. 
9251 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL)) 9252 .addGlobalAddress(M.getNamedValue(MF.getName()))); 9253 CallPt = It; 9254 It++; 9255 9256 It = MBB.insert(It, Restore); 9257 return CallPt; 9258 } 9259 9260 bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault( 9261 MachineFunction &MF) const { 9262 return MF.getFunction().hasMinSize(); 9263 } 9264 9265 void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB, 9266 MachineBasicBlock::iterator Iter, 9267 DebugLoc &DL, 9268 bool AllowSideEffects) const { 9269 const MachineFunction &MF = *MBB.getParent(); 9270 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>(); 9271 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo(); 9272 9273 if (TRI.isGeneralPurposeRegister(MF, Reg)) { 9274 BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0); 9275 } else if (STI.hasSVE()) { 9276 BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg) 9277 .addImm(0) 9278 .addImm(0); 9279 } else { 9280 BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg) 9281 .addImm(0); 9282 } 9283 } 9284 9285 std::optional<DestSourcePair> 9286 AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { 9287 9288 // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg 9289 // and zero immediate operands used as an alias for mov instruction. 9290 if (MI.getOpcode() == AArch64::ORRWrs && 9291 MI.getOperand(1).getReg() == AArch64::WZR && 9292 MI.getOperand(3).getImm() == 0x0 && 9293 // Check that the w->w move is not a zero-extending w->x mov. 9294 (!MI.getOperand(0).getReg().isVirtual() || 9295 MI.getOperand(0).getSubReg() == 0) && 9296 (!MI.getOperand(0).getReg().isPhysical() || 9297 MI.findRegisterDefOperandIdx(MI.getOperand(0).getReg() - AArch64::W0 + 9298 AArch64::X0, 9299 /*TRI=*/nullptr) == -1)) 9300 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)}; 9301 9302 if (MI.getOpcode() == AArch64::ORRXrs && 9303 MI.getOperand(1).getReg() == AArch64::XZR && 9304 MI.getOperand(3).getImm() == 0x0) 9305 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)}; 9306 9307 return std::nullopt; 9308 } 9309 9310 std::optional<DestSourcePair> 9311 AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const { 9312 if (MI.getOpcode() == AArch64::ORRWrs && 9313 MI.getOperand(1).getReg() == AArch64::WZR && 9314 MI.getOperand(3).getImm() == 0x0) 9315 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)}; 9316 return std::nullopt; 9317 } 9318 9319 std::optional<RegImmPair> 9320 AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const { 9321 int Sign = 1; 9322 int64_t Offset = 0; 9323 9324 // TODO: Handle cases where Reg is a super- or sub-register of the 9325 // destination register. 9326 const MachineOperand &Op0 = MI.getOperand(0); 9327 if (!Op0.isReg() || Reg != Op0.getReg()) 9328 return std::nullopt; 9329 9330 switch (MI.getOpcode()) { 9331 default: 9332 return std::nullopt; 9333 case AArch64::SUBWri: 9334 case AArch64::SUBXri: 9335 case AArch64::SUBSWri: 9336 case AArch64::SUBSXri: 9337 Sign *= -1; 9338 [[fallthrough]]; 9339 case AArch64::ADDSWri: 9340 case AArch64::ADDSXri: 9341 case AArch64::ADDWri: 9342 case AArch64::ADDXri: { 9343 // TODO: Third operand can be global address (usually some string). 
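    // e.g. '%x1 = ADDXri %x0, 16, 0' is described as {Reg: %x0, Offset: +16};
    // a shift operand of 12 scales the immediate by 4096, and the SUB forms
    // negate it.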
9344 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() || 9345 !MI.getOperand(2).isImm()) 9346 return std::nullopt; 9347 int Shift = MI.getOperand(3).getImm(); 9348 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12"); 9349 Offset = Sign * (MI.getOperand(2).getImm() << Shift); 9350 } 9351 } 9352 return RegImmPair{MI.getOperand(1).getReg(), Offset}; 9353 } 9354 9355 /// If the given ORR instruction is a copy, and \p DescribedReg overlaps with 9356 /// the destination register then, if possible, describe the value in terms of 9357 /// the source register. 9358 static std::optional<ParamLoadedValue> 9359 describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg, 9360 const TargetInstrInfo *TII, 9361 const TargetRegisterInfo *TRI) { 9362 auto DestSrc = TII->isCopyLikeInstr(MI); 9363 if (!DestSrc) 9364 return std::nullopt; 9365 9366 Register DestReg = DestSrc->Destination->getReg(); 9367 Register SrcReg = DestSrc->Source->getReg(); 9368 9369 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {}); 9370 9371 // If the described register is the destination, just return the source. 9372 if (DestReg == DescribedReg) 9373 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr); 9374 9375 // ORRWrs zero-extends to 64-bits, so we need to consider such cases. 9376 if (MI.getOpcode() == AArch64::ORRWrs && 9377 TRI->isSuperRegister(DestReg, DescribedReg)) 9378 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr); 9379 9380 // We may need to describe the lower part of a ORRXrs move. 9381 if (MI.getOpcode() == AArch64::ORRXrs && 9382 TRI->isSubRegister(DestReg, DescribedReg)) { 9383 Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32); 9384 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr); 9385 } 9386 9387 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) && 9388 "Unhandled ORR[XW]rs copy case"); 9389 9390 return std::nullopt; 9391 } 9392 9393 bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const { 9394 // Functions cannot be split to different sections on AArch64 if they have 9395 // a red zone. This is because relaxing a cross-section branch may require 9396 // incrementing the stack pointer to spill a register, which would overwrite 9397 // the red zone. 9398 if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true)) 9399 return false; 9400 9401 return TargetInstrInfo::isFunctionSafeToSplit(MF); 9402 } 9403 9404 bool AArch64InstrInfo::isMBBSafeToSplitToCold( 9405 const MachineBasicBlock &MBB) const { 9406 // Asm Goto blocks can contain conditional branches to goto labels, which can 9407 // get moved out of range of the branch instruction. 9408 auto isAsmGoto = [](const MachineInstr &MI) { 9409 return MI.getOpcode() == AArch64::INLINEASM_BR; 9410 }; 9411 if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget()) 9412 return false; 9413 9414 // Because jump tables are label-relative instead of table-relative, they all 9415 // must be in the same section or relocation fixup handling will fail. 
9416 9417 // Check if MBB is a jump table target 9418 const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo(); 9419 auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) { 9420 return llvm::is_contained(JTE.MBBs, &MBB); 9421 }; 9422 if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB)) 9423 return false; 9424 9425 // Check if MBB contains a jump table lookup 9426 for (const MachineInstr &MI : MBB) { 9427 switch (MI.getOpcode()) { 9428 case TargetOpcode::G_BRJT: 9429 case AArch64::JumpTableDest32: 9430 case AArch64::JumpTableDest16: 9431 case AArch64::JumpTableDest8: 9432 return false; 9433 default: 9434 continue; 9435 } 9436 } 9437 9438 // MBB isn't a special case, so it's safe to be split to the cold section. 9439 return true; 9440 } 9441 9442 std::optional<ParamLoadedValue> 9443 AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI, 9444 Register Reg) const { 9445 const MachineFunction *MF = MI.getMF(); 9446 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); 9447 switch (MI.getOpcode()) { 9448 case AArch64::MOVZWi: 9449 case AArch64::MOVZXi: { 9450 // MOVZWi may be used for producing zero-extended 32-bit immediates in 9451 // 64-bit parameters, so we need to consider super-registers. 9452 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg)) 9453 return std::nullopt; 9454 9455 if (!MI.getOperand(1).isImm()) 9456 return std::nullopt; 9457 int64_t Immediate = MI.getOperand(1).getImm(); 9458 int Shift = MI.getOperand(2).getImm(); 9459 return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift), 9460 nullptr); 9461 } 9462 case AArch64::ORRWrs: 9463 case AArch64::ORRXrs: 9464 return describeORRLoadedValue(MI, Reg, this, TRI); 9465 } 9466 9467 return TargetInstrInfo::describeLoadedValue(MI, Reg); 9468 } 9469 9470 bool AArch64InstrInfo::isExtendLikelyToBeFolded( 9471 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const { 9472 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT || 9473 ExtMI.getOpcode() == TargetOpcode::G_ZEXT || 9474 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT); 9475 9476 // Anyexts are nops. 9477 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT) 9478 return true; 9479 9480 Register DefReg = ExtMI.getOperand(0).getReg(); 9481 if (!MRI.hasOneNonDBGUse(DefReg)) 9482 return false; 9483 9484 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an 9485 // addressing mode. 9486 auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg); 9487 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD; 9488 } 9489 9490 uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const { 9491 return get(Opc).TSFlags & AArch64::ElementSizeMask; 9492 } 9493 9494 bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const { 9495 return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike; 9496 } 9497 9498 bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const { 9499 return get(Opc).TSFlags & AArch64::InstrFlagIsWhile; 9500 } 9501 9502 unsigned int 9503 AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const { 9504 return OptLevel >= CodeGenOptLevel::Aggressive ? 
6 : 2;
}

bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
                                             unsigned Scale) const {
  if (Offset && Scale)
    return false;

  // Check Reg + Imm
  if (!Scale) {
    // 9-bit signed offset
    if (isInt<9>(Offset))
      return true;

    // 12-bit unsigned offset
    unsigned Shift = Log2_64(NumBytes);
    if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
        // Must be a multiple of NumBytes (NumBytes is a power of 2)
        (Offset >> Shift) << Shift == Offset)
      return true;
    return false;
  }

  // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
  return Scale == 1 || (Scale > 0 && Scale == NumBytes);
}

unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
  if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
    return AArch64::BLRNoIP;
  else
    return AArch64::BLR;
}

MachineBasicBlock::iterator
AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
                                   Register TargetReg, bool FrameSetup) const {
  assert(TargetReg != AArch64::SP &&
         "New top of stack cannot already be in SP");

  MachineBasicBlock &MBB = *MBBI->getParent();
  MachineFunction &MF = *MBB.getParent();
  const AArch64InstrInfo *TII =
      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
  int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
  DebugLoc DL = MBB.findDebugLoc(MBBI);

  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
  MachineBasicBlock *LoopTestMBB =
      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, LoopTestMBB);
  MachineBasicBlock *LoopBodyMBB =
      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, LoopBodyMBB);
  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, ExitMBB);
  MachineInstr::MIFlag Flags =
      FrameSetup ?
MachineInstr::FrameSetup : MachineInstr::NoFlags; 9561 9562 // LoopTest: 9563 // SUB SP, SP, #ProbeSize 9564 emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP, 9565 AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags); 9566 9567 // CMP SP, TargetReg 9568 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64), 9569 AArch64::XZR) 9570 .addReg(AArch64::SP) 9571 .addReg(TargetReg) 9572 .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0)) 9573 .setMIFlags(Flags); 9574 9575 // B.<Cond> LoopExit 9576 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc)) 9577 .addImm(AArch64CC::LE) 9578 .addMBB(ExitMBB) 9579 .setMIFlags(Flags); 9580 9581 // STR XZR, [SP] 9582 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui)) 9583 .addReg(AArch64::XZR) 9584 .addReg(AArch64::SP) 9585 .addImm(0) 9586 .setMIFlags(Flags); 9587 9588 // B loop 9589 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B)) 9590 .addMBB(LoopTestMBB) 9591 .setMIFlags(Flags); 9592 9593 // LoopExit: 9594 // MOV SP, TargetReg 9595 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP) 9596 .addReg(TargetReg) 9597 .addImm(0) 9598 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) 9599 .setMIFlags(Flags); 9600 9601 // LDR XZR, [SP] 9602 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui)) 9603 .addReg(AArch64::XZR, RegState::Define) 9604 .addReg(AArch64::SP) 9605 .addImm(0) 9606 .setMIFlags(Flags); 9607 9608 ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end()); 9609 ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB); 9610 9611 LoopTestMBB->addSuccessor(ExitMBB); 9612 LoopTestMBB->addSuccessor(LoopBodyMBB); 9613 LoopBodyMBB->addSuccessor(LoopTestMBB); 9614 MBB.addSuccessor(LoopTestMBB); 9615 9616 // Update liveins. 
  if (MF.getRegInfo().reservedRegsFrozen())
    fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB});

  return ExitMBB->begin();
}

namespace {
class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
  MachineFunction *MF;
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  MachineRegisterInfo &MRI;

  /// The block of the loop
  MachineBasicBlock *LoopBB;
  /// The conditional branch of the loop
  MachineInstr *CondBranch;
  /// The compare instruction for loop control
  MachineInstr *Comp;
  /// The number of the operand of the loop counter value in Comp
  unsigned CompCounterOprNum;
  /// The instruction that updates the loop counter value
  MachineInstr *Update;
  /// The number of the operand of the loop counter value in Update
  unsigned UpdateCounterOprNum;
  /// The initial value of the loop counter
  Register Init;
  /// True iff Update is a predecessor of Comp
  bool IsUpdatePriorComp;

  /// The normalized condition used by createTripCountGreaterCondition()
  SmallVector<MachineOperand, 4> Cond;

public:
  AArch64PipelinerLoopInfo(MachineBasicBlock *LoopBB, MachineInstr *CondBranch,
                           MachineInstr *Comp, unsigned CompCounterOprNum,
                           MachineInstr *Update, unsigned UpdateCounterOprNum,
                           Register Init, bool IsUpdatePriorComp,
                           const SmallVectorImpl<MachineOperand> &Cond)
      : MF(Comp->getParent()->getParent()),
        TII(MF->getSubtarget().getInstrInfo()),
        TRI(MF->getSubtarget().getRegisterInfo()), MRI(MF->getRegInfo()),
        LoopBB(LoopBB), CondBranch(CondBranch), Comp(Comp),
        CompCounterOprNum(CompCounterOprNum), Update(Update),
        UpdateCounterOprNum(UpdateCounterOprNum), Init(Init),
        IsUpdatePriorComp(IsUpdatePriorComp), Cond(Cond.begin(), Cond.end()) {}

  bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
    // Make sure the instructions for loop control are placed in stage 0.
    // The predecessors of Comp are considered by the caller.
    return MI == Comp;
  }

  std::optional<bool> createTripCountGreaterCondition(
      int TC, MachineBasicBlock &MBB,
      SmallVectorImpl<MachineOperand> &CondParam) override {
    // A branch instruction will be inserted as "if (Cond) goto epilogue".
    // Cond is normalized for such use.
    // The predecessors of the branch are assumed to have already been
    // inserted.
    CondParam = Cond;
    return {};
  }

  void createRemainingIterationsGreaterCondition(
      int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
      DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) override;

  void setPreheader(MachineBasicBlock *NewPreheader) override {}

  void adjustTripCount(int TripCountAdjust) override {}

  void disposed() override {}
  bool isMVEExpanderSupported() override { return true; }
};
} // namespace

/// Clone an instruction from MI. The register of the ReplaceOprNum-th operand
/// is replaced by ReplaceReg. The output register is newly created.
/// The other operands are unchanged from MI.
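/// Returns the newly created output register, or 0 when the first operand of
/// MI is not a virtual register (in which case that operand is left
/// unchanged).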
static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum,
                           Register ReplaceReg, MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator InsertTo) {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
  const TargetRegisterInfo *TRI =
      MBB.getParent()->getSubtarget().getRegisterInfo();
  MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI);
  Register Result = 0;
  for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) {
    if (I == 0 && NewMI->getOperand(0).getReg().isVirtual()) {
      Result = MRI.createVirtualRegister(
          MRI.getRegClass(NewMI->getOperand(0).getReg()));
      NewMI->getOperand(I).setReg(Result);
    } else if (I == ReplaceOprNum) {
      MRI.constrainRegClass(
          ReplaceReg,
          TII->getRegClass(NewMI->getDesc(), I, TRI, *MBB.getParent()));
      NewMI->getOperand(I).setReg(ReplaceReg);
    }
  }
  MBB.insert(InsertTo, NewMI);
  return Result;
}

void AArch64PipelinerLoopInfo::createRemainingIterationsGreaterCondition(
    int TC, MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &Cond,
    DenseMap<MachineInstr *, MachineInstr *> &LastStage0Insts) {
  // Create and accumulate conditions for next TC iterations.
  // Example:
  //   SUBSXrr N, counter, implicit-def $nzcv # compare instruction for the
  //                                          # last iteration of the kernel
  //
  //   # insert the following instructions
  //   cond = CSINCXr 0, 0, C, implicit $nzcv
  //   counter = ADDXri counter, 1            # clone from this->Update
  //   SUBSXrr n, counter, implicit-def $nzcv # clone from this->Comp
  //   cond = CSINCXr cond, cond, C, implicit $nzcv
  //   ... (repeat TC times)
  //   SUBSXri cond, 0, implicit-def $nzcv

  assert(CondBranch->getOpcode() == AArch64::Bcc);
  // CondCode to exit the loop
  AArch64CC::CondCode CC =
      (AArch64CC::CondCode)CondBranch->getOperand(0).getImm();
  if (CondBranch->getOperand(1).getMBB() == LoopBB)
    CC = AArch64CC::getInvertedCondCode(CC);

  // Accumulate conditions to exit the loop
  Register AccCond = AArch64::XZR;

  // If CC holds, CurCond+1 is returned; otherwise CurCond is returned.
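  // This is implemented with a single CSINC on the inverted condition:
  //   NewCond = (inverted CC) ? CurCond : CurCond + 1
  // so the accumulator counts how many of the checked exit conditions held.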
  auto AccumulateCond = [&](Register CurCond,
                            AArch64CC::CondCode CC) -> Register {
    Register NewCond =
        MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
    BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::CSINCXr))
        .addReg(NewCond, RegState::Define)
        .addReg(CurCond)
        .addReg(CurCond)
        .addImm(AArch64CC::getInvertedCondCode(CC));
    return NewCond;
  };

  if (!LastStage0Insts.empty() && LastStage0Insts[Comp]->getParent() == &MBB) {
    // Update and Comp for I == 0 already exist in MBB
    // (MBB is an unrolled kernel).
    Register Counter;
    for (int I = 0; I <= TC; ++I) {
      Register NextCounter;
      if (I != 0)
        NextCounter =
            cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());

      AccCond = AccumulateCond(AccCond, CC);

      if (I != TC) {
        if (I == 0) {
          if (Update != Comp && IsUpdatePriorComp) {
            Counter =
                LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
            NextCounter = cloneInstr(Update, UpdateCounterOprNum, Counter, MBB,
                                     MBB.end());
          } else {
            // Can use the already calculated value.
            NextCounter = LastStage0Insts[Update]->getOperand(0).getReg();
          }
        } else if (Update != Comp) {
          NextCounter =
              cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
        }
      }
      Counter = NextCounter;
    }
  } else {
    Register Counter;
    if (LastStage0Insts.empty()) {
      // Use the initial counter value (testing if the trip count is sufficient
      // to be executed by pipelined code).
      Counter = Init;
      if (IsUpdatePriorComp)
        Counter =
            cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
    } else {
      // MBB is an epilogue block. LastStage0Insts[Comp] is in the kernel
      // block.
      Counter = LastStage0Insts[Comp]->getOperand(CompCounterOprNum).getReg();
    }

    for (int I = 0; I <= TC; ++I) {
      Register NextCounter;
      NextCounter =
          cloneInstr(Comp, CompCounterOprNum, Counter, MBB, MBB.end());
      AccCond = AccumulateCond(AccCond, CC);
      if (I != TC && Update != Comp)
        NextCounter =
            cloneInstr(Update, UpdateCounterOprNum, Counter, MBB, MBB.end());
      Counter = NextCounter;
    }
  }

  // If AccCond == 0, the remainder is greater than TC.
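  // Materialize that test in NZCV: SUBS XZR, AccCond, #0 is equivalent to
  // CMP AccCond, #0, and the EQ condition pushed below is what the caller
  // branches on.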
  BuildMI(MBB, MBB.end(), Comp->getDebugLoc(), TII->get(AArch64::SUBSXri))
      .addReg(AArch64::XZR, RegState::Define | RegState::Dead)
      .addReg(AccCond)
      .addImm(0)
      .addImm(0);
  Cond.clear();
  Cond.push_back(MachineOperand::CreateImm(AArch64CC::EQ));
}

static void extractPhiReg(const MachineInstr &Phi, const MachineBasicBlock *MBB,
                          Register &RegMBB, Register &RegOther) {
  assert(Phi.getNumOperands() == 5);
  if (Phi.getOperand(2).getMBB() == MBB) {
    RegMBB = Phi.getOperand(1).getReg();
    RegOther = Phi.getOperand(3).getReg();
  } else {
    assert(Phi.getOperand(4).getMBB() == MBB);
    RegMBB = Phi.getOperand(3).getReg();
    RegOther = Phi.getOperand(1).getReg();
  }
}

static bool isDefinedOutside(Register Reg, const MachineBasicBlock *BB) {
  if (!Reg.isVirtual())
    return false;
  const MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
  return MRI.getVRegDef(Reg)->getParent() != BB;
}

/// If Reg is an induction variable, return true and set the output parameters
/// that describe how it is updated.
static bool getIndVarInfo(Register Reg, const MachineBasicBlock *LoopBB,
                          MachineInstr *&UpdateInst,
                          unsigned &UpdateCounterOprNum, Register &InitReg,
                          bool &IsUpdatePriorComp) {
  // Example:
  //
  // Preheader:
  //   InitReg = ...
  // LoopBB:
  //   Reg0 = PHI (InitReg, Preheader), (Reg1, LoopBB)
  //   Reg = COPY Reg0   ; COPY is ignored.
  //   Reg1 = ADD Reg, #1; UpdateInst. Incremented by a loop invariant value.
  //                     ; Reg is the value calculated in the previous
  //                     ; iteration, so IsUpdatePriorComp == false.

  if (LoopBB->pred_size() != 2)
    return false;
  if (!Reg.isVirtual())
    return false;
  const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
  UpdateInst = nullptr;
  UpdateCounterOprNum = 0;
  InitReg = 0;
  IsUpdatePriorComp = true;
  Register CurReg = Reg;
  while (true) {
    MachineInstr *Def = MRI.getVRegDef(CurReg);
    if (Def->getParent() != LoopBB)
      return false;
    if (Def->isCopy()) {
      // Ignore copy instructions unless they contain subregisters
      if (Def->getOperand(0).getSubReg() || Def->getOperand(1).getSubReg())
        return false;
      CurReg = Def->getOperand(1).getReg();
    } else if (Def->isPHI()) {
      if (InitReg != 0)
        return false;
      if (!UpdateInst)
        IsUpdatePriorComp = false;
      extractPhiReg(*Def, LoopBB, CurReg, InitReg);
    } else {
      if (UpdateInst)
        return false;
      switch (Def->getOpcode()) {
      case AArch64::ADDSXri:
      case AArch64::ADDSWri:
      case AArch64::SUBSXri:
      case AArch64::SUBSWri:
      case AArch64::ADDXri:
      case AArch64::ADDWri:
      case AArch64::SUBXri:
      case AArch64::SUBWri:
        UpdateInst = Def;
        UpdateCounterOprNum = 1;
        break;
      case AArch64::ADDSXrr:
      case AArch64::ADDSWrr:
      case AArch64::SUBSXrr:
      case AArch64::SUBSWrr:
      case AArch64::ADDXrr:
      case AArch64::ADDWrr:
      case AArch64::SUBXrr:
      case AArch64::SUBWrr:
        UpdateInst = Def;
        if (isDefinedOutside(Def->getOperand(2).getReg(), LoopBB))
          UpdateCounterOprNum = 1;
        else if (isDefinedOutside(Def->getOperand(1).getReg(), LoopBB))
          UpdateCounterOprNum = 2;
        else
          return false;
        break;
      default:
        return false;
      }
      CurReg = Def->getOperand(UpdateCounterOprNum).getReg();
    }

    if (!CurReg.isVirtual())
      return false;
    if (Reg == CurReg)
      break;
  }

  if (!UpdateInst)
    return false;

  return true;
}

std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
  // Accept loops that meet the following conditions:
  //   * The conditional branch is BCC
  //   * The compare instruction is ADDS/SUBS/WHILEXX
  //   * One operand of the compare is an induction variable and the other is
  //     a loop invariant value
  //   * The induction variable is incremented/decremented by a single
  //     instruction
  //   * The loop does not contain calls or instructions that have unmodeled
  //     side effects

  for (MachineInstr &MI : *LoopBB)
    if (MI.isCall() || MI.hasUnmodeledSideEffects())
      // This instruction may use NZCV, which interferes with the instruction
      // to be inserted for loop control.
      return nullptr;

  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
    return nullptr;

  // Infinite loops are not supported.
  if (TBB == LoopBB && FBB == LoopBB)
    return nullptr;

  // Must be a conditional branch.
  if (TBB != LoopBB && FBB == nullptr)
    return nullptr;

  assert((TBB == LoopBB || FBB == LoopBB) &&
         "The Loop must be a single-basic-block loop");

  MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
  const TargetRegisterInfo &TRI = getRegisterInfo();

  if (CondBranch->getOpcode() != AArch64::Bcc)
    return nullptr;

  // Normalization for createTripCountGreaterCondition()
  if (TBB == LoopBB)
    reverseBranchCondition(Cond);

  MachineInstr *Comp = nullptr;
  unsigned CompCounterOprNum = 0;
  for (MachineInstr &MI : reverse(*LoopBB)) {
    if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
      // Guarantee that the compare is SUBS/ADDS/WHILEXX and that one of the
      // operands is a loop invariant value.

      switch (MI.getOpcode()) {
      case AArch64::SUBSXri:
      case AArch64::SUBSWri:
      case AArch64::ADDSXri:
      case AArch64::ADDSWri:
        Comp = &MI;
        CompCounterOprNum = 1;
        break;
      case AArch64::ADDSWrr:
      case AArch64::ADDSXrr:
      case AArch64::SUBSWrr:
      case AArch64::SUBSXrr:
        Comp = &MI;
        break;
      default:
        if (isWhileOpcode(MI.getOpcode())) {
          Comp = &MI;
          break;
        }
        return nullptr;
      }

      if (CompCounterOprNum == 0) {
        if (isDefinedOutside(Comp->getOperand(1).getReg(), LoopBB))
          CompCounterOprNum = 2;
        else if (isDefinedOutside(Comp->getOperand(2).getReg(), LoopBB))
          CompCounterOprNum = 1;
        else
          return nullptr;
      }
      break;
    }
  }
  if (!Comp)
    return nullptr;

  MachineInstr *Update = nullptr;
  Register Init;
  bool IsUpdatePriorComp;
  unsigned UpdateCounterOprNum;
  if (!getIndVarInfo(Comp->getOperand(CompCounterOprNum).getReg(), LoopBB,
                     Update, UpdateCounterOprNum, Init, IsUpdatePriorComp))
    return nullptr;

  return std::make_unique<AArch64PipelinerLoopInfo>(
      LoopBB, CondBranch, Comp, CompCounterOprNum, Update, UpdateCounterOprNum,
      Init, IsUpdatePriorComp, Cond);
}

#define GET_INSTRINFO_HELPERS
#define GET_INSTRMAP_INFO
#include "AArch64GenInstrInfo.inc"