1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains the AArch64 implementation of the TargetInstrInfo class. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "AArch64InstrInfo.h" 14 #include "AArch64ExpandImm.h" 15 #include "AArch64FrameLowering.h" 16 #include "AArch64MachineFunctionInfo.h" 17 #include "AArch64PointerAuth.h" 18 #include "AArch64Subtarget.h" 19 #include "MCTargetDesc/AArch64AddressingModes.h" 20 #include "Utils/AArch64BaseInfo.h" 21 #include "llvm/ADT/ArrayRef.h" 22 #include "llvm/ADT/STLExtras.h" 23 #include "llvm/ADT/SmallVector.h" 24 #include "llvm/CodeGen/LivePhysRegs.h" 25 #include "llvm/CodeGen/MachineBasicBlock.h" 26 #include "llvm/CodeGen/MachineCombinerPattern.h" 27 #include "llvm/CodeGen/MachineFrameInfo.h" 28 #include "llvm/CodeGen/MachineFunction.h" 29 #include "llvm/CodeGen/MachineInstr.h" 30 #include "llvm/CodeGen/MachineInstrBuilder.h" 31 #include "llvm/CodeGen/MachineMemOperand.h" 32 #include "llvm/CodeGen/MachineModuleInfo.h" 33 #include "llvm/CodeGen/MachineOperand.h" 34 #include "llvm/CodeGen/MachineRegisterInfo.h" 35 #include "llvm/CodeGen/RegisterScavenging.h" 36 #include "llvm/CodeGen/StackMaps.h" 37 #include "llvm/CodeGen/TargetRegisterInfo.h" 38 #include "llvm/CodeGen/TargetSubtargetInfo.h" 39 #include "llvm/IR/DebugInfoMetadata.h" 40 #include "llvm/IR/DebugLoc.h" 41 #include "llvm/IR/GlobalValue.h" 42 #include "llvm/MC/MCAsmInfo.h" 43 #include "llvm/MC/MCInst.h" 44 #include "llvm/MC/MCInstBuilder.h" 45 #include "llvm/MC/MCInstrDesc.h" 46 #include "llvm/Support/Casting.h" 47 #include "llvm/Support/CodeGen.h" 48 #include "llvm/Support/CommandLine.h" 49 #include "llvm/Support/ErrorHandling.h" 50 #include "llvm/Support/LEB128.h" 51 #include "llvm/Support/MathExtras.h" 52 #include "llvm/Target/TargetMachine.h" 53 #include "llvm/Target/TargetOptions.h" 54 #include <cassert> 55 #include <cstdint> 56 #include <iterator> 57 #include <utility> 58 59 using namespace llvm; 60 61 #define GET_INSTRINFO_CTOR_DTOR 62 #include "AArch64GenInstrInfo.inc" 63 64 static cl::opt<unsigned> TBZDisplacementBits( 65 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), 66 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); 67 68 static cl::opt<unsigned> CBZDisplacementBits( 69 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), 70 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)")); 71 72 static cl::opt<unsigned> 73 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), 74 cl::desc("Restrict range of Bcc instructions (DEBUG)")); 75 76 static cl::opt<unsigned> 77 BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26), 78 cl::desc("Restrict range of B instructions (DEBUG)")); 79 80 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) 81 : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP, 82 AArch64::CATCHRET), 83 RI(STI.getTargetTriple()), Subtarget(STI) {} 84 85 /// GetInstSize - Return the number of bytes of code the specified 86 /// instruction may be. This returns the maximum number of bytes. 
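/// Most AArch64 instructions are a fixed 4 bytes; the cases handled below are
/// the variable-size exceptions (inline asm, meta-instructions, the
/// STACKMAP/PATCHPOINT/STATEPOINT patch points, XRay sleds, SPACE and bundles).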
87 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { 88 const MachineBasicBlock &MBB = *MI.getParent(); 89 const MachineFunction *MF = MBB.getParent(); 90 const Function &F = MF->getFunction(); 91 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); 92 93 { 94 auto Op = MI.getOpcode(); 95 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR) 96 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); 97 } 98 99 // Meta-instructions emit no code. 100 if (MI.isMetaInstruction()) 101 return 0; 102 103 // FIXME: We currently only handle pseudoinstructions that don't get expanded 104 // before the assembly printer. 105 unsigned NumBytes = 0; 106 const MCInstrDesc &Desc = MI.getDesc(); 107 108 // Size should be preferably set in 109 // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case). 110 // Specific cases handle instructions of variable sizes 111 switch (Desc.getOpcode()) { 112 default: 113 if (Desc.getSize()) 114 return Desc.getSize(); 115 116 // Anything not explicitly designated otherwise (i.e. pseudo-instructions 117 // with fixed constant size but not specified in .td file) is a normal 118 // 4-byte insn. 119 NumBytes = 4; 120 break; 121 case TargetOpcode::STACKMAP: 122 // The upper bound for a stackmap intrinsic is the full length of its shadow 123 NumBytes = StackMapOpers(&MI).getNumPatchBytes(); 124 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 125 break; 126 case TargetOpcode::PATCHPOINT: 127 // The size of the patchpoint intrinsic is the number of bytes requested 128 NumBytes = PatchPointOpers(&MI).getNumPatchBytes(); 129 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 130 break; 131 case TargetOpcode::STATEPOINT: 132 NumBytes = StatepointOpers(&MI).getNumPatchBytes(); 133 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 134 // No patch bytes means a normal call inst is emitted 135 if (NumBytes == 0) 136 NumBytes = 4; 137 break; 138 case TargetOpcode::PATCHABLE_FUNCTION_ENTER: 139 // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER 140 // instructions are expanded to the specified number of NOPs. Otherwise, 141 // they are expanded to 36-byte XRay sleds. 142 NumBytes = 143 F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4; 144 break; 145 case TargetOpcode::PATCHABLE_FUNCTION_EXIT: 146 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL: 147 // An XRay sled can be 4 bytes of alignment plus a 32-byte block. 148 NumBytes = 36; 149 break; 150 case TargetOpcode::PATCHABLE_EVENT_CALL: 151 // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment). 152 NumBytes = 24; 153 break; 154 155 case AArch64::SPACE: 156 NumBytes = MI.getOperand(1).getImm(); 157 break; 158 case TargetOpcode::BUNDLE: 159 NumBytes = getInstBundleLength(MI); 160 break; 161 } 162 163 return NumBytes; 164 } 165 166 unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const { 167 unsigned Size = 0; 168 MachineBasicBlock::const_instr_iterator I = MI.getIterator(); 169 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); 170 while (++I != E && I->isInsideBundle()) { 171 assert(!I->isBundle() && "No nested bundle!"); 172 Size += getInstSizeInBytes(*I); 173 } 174 return Size; 175 } 176 177 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, 178 SmallVectorImpl<MachineOperand> &Cond) { 179 // Block ends with fall-through condbranch. 
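  // The Cond vector filled in here encodes the branch kind for later
  // materialization by instantiateCondBranch() and insertSelect():
  //   Bcc:          Cond = { condition code }
  //   CB(N)Z[WX]:   Cond = { -1, opcode, register }
  //   TB(N)Z[WX]:   Cond = { -1, opcode, register, bit number }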
180 switch (LastInst->getOpcode()) { 181 default: 182 llvm_unreachable("Unknown branch instruction?"); 183 case AArch64::Bcc: 184 Target = LastInst->getOperand(1).getMBB(); 185 Cond.push_back(LastInst->getOperand(0)); 186 break; 187 case AArch64::CBZW: 188 case AArch64::CBZX: 189 case AArch64::CBNZW: 190 case AArch64::CBNZX: 191 Target = LastInst->getOperand(1).getMBB(); 192 Cond.push_back(MachineOperand::CreateImm(-1)); 193 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); 194 Cond.push_back(LastInst->getOperand(0)); 195 break; 196 case AArch64::TBZW: 197 case AArch64::TBZX: 198 case AArch64::TBNZW: 199 case AArch64::TBNZX: 200 Target = LastInst->getOperand(2).getMBB(); 201 Cond.push_back(MachineOperand::CreateImm(-1)); 202 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); 203 Cond.push_back(LastInst->getOperand(0)); 204 Cond.push_back(LastInst->getOperand(1)); 205 } 206 } 207 208 static unsigned getBranchDisplacementBits(unsigned Opc) { 209 switch (Opc) { 210 default: 211 llvm_unreachable("unexpected opcode!"); 212 case AArch64::B: 213 return BDisplacementBits; 214 case AArch64::TBNZW: 215 case AArch64::TBZW: 216 case AArch64::TBNZX: 217 case AArch64::TBZX: 218 return TBZDisplacementBits; 219 case AArch64::CBNZW: 220 case AArch64::CBZW: 221 case AArch64::CBNZX: 222 case AArch64::CBZX: 223 return CBZDisplacementBits; 224 case AArch64::Bcc: 225 return BCCDisplacementBits; 226 } 227 } 228 229 bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp, 230 int64_t BrOffset) const { 231 unsigned Bits = getBranchDisplacementBits(BranchOp); 232 assert(Bits >= 3 && "max branch displacement must be enough to jump" 233 "over conditional branch expansion"); 234 return isIntN(Bits, BrOffset / 4); 235 } 236 237 MachineBasicBlock * 238 AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const { 239 switch (MI.getOpcode()) { 240 default: 241 llvm_unreachable("unexpected opcode!"); 242 case AArch64::B: 243 return MI.getOperand(0).getMBB(); 244 case AArch64::TBZW: 245 case AArch64::TBNZW: 246 case AArch64::TBZX: 247 case AArch64::TBNZX: 248 return MI.getOperand(2).getMBB(); 249 case AArch64::CBZW: 250 case AArch64::CBNZW: 251 case AArch64::CBZX: 252 case AArch64::CBNZX: 253 case AArch64::Bcc: 254 return MI.getOperand(1).getMBB(); 255 } 256 } 257 258 void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, 259 MachineBasicBlock &NewDestBB, 260 MachineBasicBlock &RestoreBB, 261 const DebugLoc &DL, 262 int64_t BrOffset, 263 RegScavenger *RS) const { 264 assert(RS && "RegScavenger required for long branching"); 265 assert(MBB.empty() && 266 "new block should be inserted for expanding unconditional branch"); 267 assert(MBB.pred_size() == 1); 268 assert(RestoreBB.empty() && 269 "restore block should be inserted for restoring clobbered registers"); 270 271 auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) { 272 // Offsets outside of the signed 33-bit range are not supported for ADRP + 273 // ADD. 
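    // The materialized sequence is, e.g.:
    //   adrp xN, DestBB@PAGE
    //   add  xN, xN, DestBB@PAGEOFF
    //   br   xN
    // which covers the +/-4 GiB (signed 33-bit) range of ADRP.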
274 if (!isInt<33>(BrOffset)) 275 report_fatal_error( 276 "Branch offsets outside of the signed 33-bit range not supported"); 277 278 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg) 279 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE); 280 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg) 281 .addReg(Reg) 282 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC) 283 .addImm(0); 284 BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg); 285 }; 286 287 RS->enterBasicBlockEnd(MBB); 288 // If X16 is unused, we can rely on the linker to insert a range extension 289 // thunk if NewDestBB is out of range of a single B instruction. 290 constexpr Register Reg = AArch64::X16; 291 if (!RS->isRegUsed(Reg)) { 292 insertUnconditionalBranch(MBB, &NewDestBB, DL); 293 RS->setRegUsed(Reg); 294 return; 295 } 296 297 // If there's a free register and it's worth inflating the code size, 298 // manually insert the indirect branch. 299 Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass); 300 if (Scavenged != AArch64::NoRegister && 301 MBB.getSectionID() == MBBSectionID::ColdSectionID) { 302 buildIndirectBranch(Scavenged, NewDestBB); 303 RS->setRegUsed(Scavenged); 304 return; 305 } 306 307 // Note: Spilling X16 briefly moves the stack pointer, making it incompatible 308 // with red zones. 309 AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>(); 310 if (!AFI || AFI->hasRedZone().value_or(true)) 311 report_fatal_error( 312 "Unable to insert indirect branch inside function that has red zone"); 313 314 // Otherwise, spill X16 and defer range extension to the linker. 315 BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre)) 316 .addReg(AArch64::SP, RegState::Define) 317 .addReg(Reg) 318 .addReg(AArch64::SP) 319 .addImm(-16); 320 321 BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB); 322 323 BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost)) 324 .addReg(AArch64::SP, RegState::Define) 325 .addReg(Reg, RegState::Define) 326 .addReg(AArch64::SP) 327 .addImm(16); 328 } 329 330 // Branch analysis. 331 bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB, 332 MachineBasicBlock *&TBB, 333 MachineBasicBlock *&FBB, 334 SmallVectorImpl<MachineOperand> &Cond, 335 bool AllowModify) const { 336 // If the block has no terminators, it just falls into the block after it. 337 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); 338 if (I == MBB.end()) 339 return false; 340 341 // Skip over SpeculationBarrierEndBB terminators 342 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB || 343 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) { 344 --I; 345 } 346 347 if (!isUnpredicatedTerminator(*I)) 348 return false; 349 350 // Get the last instruction in the block. 351 MachineInstr *LastInst = &*I; 352 353 // If there is only one terminator instruction, process it. 354 unsigned LastOpc = LastInst->getOpcode(); 355 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { 356 if (isUncondBranchOpcode(LastOpc)) { 357 TBB = LastInst->getOperand(0).getMBB(); 358 return false; 359 } 360 if (isCondBranchOpcode(LastOpc)) { 361 // Block ends with fall-through condbranch. 362 parseCondBranch(LastInst, TBB, Cond); 363 return false; 364 } 365 return true; // Can't handle indirect branch. 366 } 367 368 // Get the instruction before it if it is a terminator. 
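  // From here on we look at the last two terminators; the recognized forms are
  // a conditional branch followed by an unconditional branch, two unconditional
  // branches, and an indirect branch followed by an unconditional branch.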
  MachineInstr *SecondLastInst = &*I;
  unsigned SecondLastOpc = SecondLastInst->getOpcode();

  // If AllowModify is true and the block ends with two or more unconditional
  // branches, delete all but the first unconditional branch.
  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
    while (isUncondBranchOpcode(SecondLastOpc)) {
      LastInst->eraseFromParent();
      LastInst = SecondLastInst;
      LastOpc = LastInst->getOpcode();
      if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
        // Return now; the only terminator is an unconditional branch.
        TBB = LastInst->getOperand(0).getMBB();
        return false;
      }
      SecondLastInst = &*I;
      SecondLastOpc = SecondLastInst->getOpcode();
    }
  }

  // If we're allowed to modify and the block ends in an unconditional branch
  // which could simply fallthrough, remove the branch. (Note: This case only
  // matters when we can't understand the whole sequence, otherwise it's also
  // handled by BranchFolding.cpp.)
  if (AllowModify && isUncondBranchOpcode(LastOpc) &&
      MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
    LastInst->eraseFromParent();
    LastInst = SecondLastInst;
    LastOpc = LastInst->getOpcode();
    if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
      assert(!isUncondBranchOpcode(LastOpc) &&
             "unreachable unconditional branches removed above");

      if (isCondBranchOpcode(LastOpc)) {
        // Block ends with fall-through condbranch.
        parseCondBranch(LastInst, TBB, Cond);
        return false;
      }
      return true; // Can't handle indirect branch.
    }
    SecondLastInst = &*I;
    SecondLastOpc = SecondLastInst->getOpcode();
  }

  // If there are three terminators, we don't know what sort of block this is.
  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
    return true;

  // If the block ends with a B and a Bcc, handle it.
  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    parseCondBranch(SecondLastInst, TBB, Cond);
    FBB = LastInst->getOperand(0).getMBB();
    return false;
  }

  // If the block ends with two unconditional branches, handle it. The second
  // one is not executed, so remove it.
  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    TBB = SecondLastInst->getOperand(0).getMBB();
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return false;
  }

  // ...likewise if it ends with an indirect branch followed by an unconditional
  // branch.
  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return true;
  }

  // Otherwise, can't handle this.
  return true;
}

bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
                                              MachineBranchPredicate &MBP,
                                              bool AllowModify) const {
  // For the moment, handle only a block which ends with a cb(n)zx followed by
  // a fallthrough. Why this? Because it is a common form.
  // TODO: Should we handle b.cc?

  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return true;

  // Skip over SpeculationBarrierEndBB terminators
  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
    --I;
  }

  if (!isUnpredicatedTerminator(*I))
    return true;

  // Get the last instruction in the block.
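  // The recognized pattern below is a cb(n)z terminator followed by a
  // fall-through, e.g. (illustrative):
  //   cbnz x0, %bb.true
  //   ; fall through to %bb.false
  // which is reported as LHS = x0, RHS = 0, Predicate = PRED_NE.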
468 MachineInstr *LastInst = &*I; 469 unsigned LastOpc = LastInst->getOpcode(); 470 if (!isCondBranchOpcode(LastOpc)) 471 return true; 472 473 switch (LastOpc) { 474 default: 475 return true; 476 case AArch64::CBZW: 477 case AArch64::CBZX: 478 case AArch64::CBNZW: 479 case AArch64::CBNZX: 480 break; 481 }; 482 483 MBP.TrueDest = LastInst->getOperand(1).getMBB(); 484 assert(MBP.TrueDest && "expected!"); 485 MBP.FalseDest = MBB.getNextNode(); 486 487 MBP.ConditionDef = nullptr; 488 MBP.SingleUseCondition = false; 489 490 MBP.LHS = LastInst->getOperand(0); 491 MBP.RHS = MachineOperand::CreateImm(0); 492 MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE 493 : MachineBranchPredicate::PRED_EQ; 494 return false; 495 } 496 497 bool AArch64InstrInfo::reverseBranchCondition( 498 SmallVectorImpl<MachineOperand> &Cond) const { 499 if (Cond[0].getImm() != -1) { 500 // Regular Bcc 501 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm(); 502 Cond[0].setImm(AArch64CC::getInvertedCondCode(CC)); 503 } else { 504 // Folded compare-and-branch 505 switch (Cond[1].getImm()) { 506 default: 507 llvm_unreachable("Unknown conditional branch!"); 508 case AArch64::CBZW: 509 Cond[1].setImm(AArch64::CBNZW); 510 break; 511 case AArch64::CBNZW: 512 Cond[1].setImm(AArch64::CBZW); 513 break; 514 case AArch64::CBZX: 515 Cond[1].setImm(AArch64::CBNZX); 516 break; 517 case AArch64::CBNZX: 518 Cond[1].setImm(AArch64::CBZX); 519 break; 520 case AArch64::TBZW: 521 Cond[1].setImm(AArch64::TBNZW); 522 break; 523 case AArch64::TBNZW: 524 Cond[1].setImm(AArch64::TBZW); 525 break; 526 case AArch64::TBZX: 527 Cond[1].setImm(AArch64::TBNZX); 528 break; 529 case AArch64::TBNZX: 530 Cond[1].setImm(AArch64::TBZX); 531 break; 532 } 533 } 534 535 return false; 536 } 537 538 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB, 539 int *BytesRemoved) const { 540 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); 541 if (I == MBB.end()) 542 return 0; 543 544 if (!isUncondBranchOpcode(I->getOpcode()) && 545 !isCondBranchOpcode(I->getOpcode())) 546 return 0; 547 548 // Remove the branch. 549 I->eraseFromParent(); 550 551 I = MBB.end(); 552 553 if (I == MBB.begin()) { 554 if (BytesRemoved) 555 *BytesRemoved = 4; 556 return 1; 557 } 558 --I; 559 if (!isCondBranchOpcode(I->getOpcode())) { 560 if (BytesRemoved) 561 *BytesRemoved = 4; 562 return 1; 563 } 564 565 // Remove the branch. 566 I->eraseFromParent(); 567 if (BytesRemoved) 568 *BytesRemoved = 8; 569 570 return 2; 571 } 572 573 void AArch64InstrInfo::instantiateCondBranch( 574 MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB, 575 ArrayRef<MachineOperand> Cond) const { 576 if (Cond[0].getImm() != -1) { 577 // Regular Bcc 578 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB); 579 } else { 580 // Folded compare-and-branch 581 // Note that we use addOperand instead of addReg to keep the flags. 582 const MachineInstrBuilder MIB = 583 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]); 584 if (Cond.size() > 3) 585 MIB.addImm(Cond[3].getImm()); 586 MIB.addMBB(TBB); 587 } 588 } 589 590 unsigned AArch64InstrInfo::insertBranch( 591 MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, 592 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const { 593 // Shouldn't be a fall through. 594 assert(TBB && "insertBranch must not be told to insert a fallthrough"); 595 596 if (!FBB) { 597 if (Cond.empty()) // Unconditional branch? 
598 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB); 599 else 600 instantiateCondBranch(MBB, DL, TBB, Cond); 601 602 if (BytesAdded) 603 *BytesAdded = 4; 604 605 return 1; 606 } 607 608 // Two-way conditional branch. 609 instantiateCondBranch(MBB, DL, TBB, Cond); 610 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB); 611 612 if (BytesAdded) 613 *BytesAdded = 8; 614 615 return 2; 616 } 617 618 // Find the original register that VReg is copied from. 619 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { 620 while (Register::isVirtualRegister(VReg)) { 621 const MachineInstr *DefMI = MRI.getVRegDef(VReg); 622 if (!DefMI->isFullCopy()) 623 return VReg; 624 VReg = DefMI->getOperand(1).getReg(); 625 } 626 return VReg; 627 } 628 629 // Determine if VReg is defined by an instruction that can be folded into a 630 // csel instruction. If so, return the folded opcode, and the replacement 631 // register. 632 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, 633 unsigned *NewVReg = nullptr) { 634 VReg = removeCopies(MRI, VReg); 635 if (!Register::isVirtualRegister(VReg)) 636 return 0; 637 638 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg)); 639 const MachineInstr *DefMI = MRI.getVRegDef(VReg); 640 unsigned Opc = 0; 641 unsigned SrcOpNum = 0; 642 switch (DefMI->getOpcode()) { 643 case AArch64::ADDSXri: 644 case AArch64::ADDSWri: 645 // if NZCV is used, do not fold. 646 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 647 return 0; 648 // fall-through to ADDXri and ADDWri. 649 [[fallthrough]]; 650 case AArch64::ADDXri: 651 case AArch64::ADDWri: 652 // add x, 1 -> csinc. 653 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 || 654 DefMI->getOperand(3).getImm() != 0) 655 return 0; 656 SrcOpNum = 1; 657 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr; 658 break; 659 660 case AArch64::ORNXrr: 661 case AArch64::ORNWrr: { 662 // not x -> csinv, represented as orn dst, xzr, src. 663 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); 664 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) 665 return 0; 666 SrcOpNum = 2; 667 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr; 668 break; 669 } 670 671 case AArch64::SUBSXrr: 672 case AArch64::SUBSWrr: 673 // if NZCV is used, do not fold. 674 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 675 return 0; 676 // fall-through to SUBXrr and SUBWrr. 677 [[fallthrough]]; 678 case AArch64::SUBXrr: 679 case AArch64::SUBWrr: { 680 // neg x -> csneg, represented as sub dst, xzr, src. 681 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); 682 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) 683 return 0; 684 SrcOpNum = 2; 685 Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr; 686 break; 687 } 688 default: 689 return 0; 690 } 691 assert(Opc && SrcOpNum && "Missing parameters"); 692 693 if (NewVReg) 694 *NewVReg = DefMI->getOperand(SrcOpNum).getReg(); 695 return Opc; 696 } 697 698 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 699 ArrayRef<MachineOperand> Cond, 700 Register DstReg, Register TrueReg, 701 Register FalseReg, int &CondCycles, 702 int &TrueCycles, 703 int &FalseCycles) const { 704 // Check register classes. 
705 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 706 const TargetRegisterClass *RC = 707 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); 708 if (!RC) 709 return false; 710 711 // Also need to check the dest regclass, in case we're trying to optimize 712 // something like: 713 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2 714 if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg))) 715 return false; 716 717 // Expanding cbz/tbz requires an extra cycle of latency on the condition. 718 unsigned ExtraCondLat = Cond.size() != 1; 719 720 // GPRs are handled by csel. 721 // FIXME: Fold in x+1, -x, and ~x when applicable. 722 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) || 723 AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 724 // Single-cycle csel, csinc, csinv, and csneg. 725 CondCycles = 1 + ExtraCondLat; 726 TrueCycles = FalseCycles = 1; 727 if (canFoldIntoCSel(MRI, TrueReg)) 728 TrueCycles = 0; 729 else if (canFoldIntoCSel(MRI, FalseReg)) 730 FalseCycles = 0; 731 return true; 732 } 733 734 // Scalar floating point is handled by fcsel. 735 // FIXME: Form fabs, fmin, and fmax when applicable. 736 if (AArch64::FPR64RegClass.hasSubClassEq(RC) || 737 AArch64::FPR32RegClass.hasSubClassEq(RC)) { 738 CondCycles = 5 + ExtraCondLat; 739 TrueCycles = FalseCycles = 2; 740 return true; 741 } 742 743 // Can't do vectors. 744 return false; 745 } 746 747 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, 748 MachineBasicBlock::iterator I, 749 const DebugLoc &DL, Register DstReg, 750 ArrayRef<MachineOperand> Cond, 751 Register TrueReg, Register FalseReg) const { 752 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 753 754 // Parse the condition code, see parseCondBranch() above. 755 AArch64CC::CondCode CC; 756 switch (Cond.size()) { 757 default: 758 llvm_unreachable("Unknown condition opcode in Cond"); 759 case 1: // b.cc 760 CC = AArch64CC::CondCode(Cond[0].getImm()); 761 break; 762 case 3: { // cbz/cbnz 763 // We must insert a compare against 0. 764 bool Is64Bit; 765 switch (Cond[1].getImm()) { 766 default: 767 llvm_unreachable("Unknown branch opcode in Cond"); 768 case AArch64::CBZW: 769 Is64Bit = false; 770 CC = AArch64CC::EQ; 771 break; 772 case AArch64::CBZX: 773 Is64Bit = true; 774 CC = AArch64CC::EQ; 775 break; 776 case AArch64::CBNZW: 777 Is64Bit = false; 778 CC = AArch64CC::NE; 779 break; 780 case AArch64::CBNZX: 781 Is64Bit = true; 782 CC = AArch64CC::NE; 783 break; 784 } 785 Register SrcReg = Cond[2].getReg(); 786 if (Is64Bit) { 787 // cmp reg, #0 is actually subs xzr, reg, #0. 788 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass); 789 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR) 790 .addReg(SrcReg) 791 .addImm(0) 792 .addImm(0); 793 } else { 794 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass); 795 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR) 796 .addReg(SrcReg) 797 .addImm(0) 798 .addImm(0); 799 } 800 break; 801 } 802 case 4: { // tbz/tbnz 803 // We must insert a tst instruction. 804 switch (Cond[1].getImm()) { 805 default: 806 llvm_unreachable("Unknown branch opcode in Cond"); 807 case AArch64::TBZW: 808 case AArch64::TBZX: 809 CC = AArch64CC::EQ; 810 break; 811 case AArch64::TBNZW: 812 case AArch64::TBNZX: 813 CC = AArch64CC::NE; 814 break; 815 } 816 // cmp reg, #foo is actually ands xzr, reg, #1<<foo. 
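    // For example, a tbnz w0, #3 condition is materialized as (illustrative):
    //   ands wzr, w0, #0x8        // tst w0, #(1 << 3)
    // and the csel built below then uses the NE condition computed above.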
    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
      BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
    else
      BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
    break;
  }
  }

  unsigned Opc = 0;
  const TargetRegisterClass *RC = nullptr;
  bool TryFold = false;
  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
    RC = &AArch64::GPR64RegClass;
    Opc = AArch64::CSELXr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
    RC = &AArch64::GPR32RegClass;
    Opc = AArch64::CSELWr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FCSELDrrr;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
    RC = &AArch64::FPR32RegClass;
    Opc = AArch64::FCSELSrrr;
  }
  assert(RC && "Unsupported regclass");

  // Try folding simple instructions into the csel.
  if (TryFold) {
    unsigned NewVReg = 0;
    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
    if (FoldedOpc) {
      // The folded opcodes csinv, csinc and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
      CC = AArch64CC::getInvertedCondCode(CC);
      TrueReg = FalseReg;
    } else
      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);

    // Fold the operation. Leave any dead instructions for DCE to clean up.
    if (FoldedOpc) {
      FalseReg = NewVReg;
      Opc = FoldedOpc;
      // This extends the live range of NewVReg.
      MRI.clearKillFlags(NewVReg);
    }
  }

  // Pull all virtual registers into the appropriate class.
  MRI.constrainRegClass(TrueReg, RC);
  MRI.constrainRegClass(FalseReg, RC);

  // Insert the csel.
  BuildMI(MBB, I, DL, get(Opc), DstReg)
      .addReg(TrueReg)
      .addReg(FalseReg)
      .addImm(CC);
}

// Return true if Imm can be loaded into a register by a "cheap" sequence of
// instructions. For now, "cheap" means at most two instructions.
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
  if (BitSize == 32)
    return true;

  assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
  uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
  AArch64_IMM::expandMOVImm(Imm, BitSize, Is);

  return Is.size() <= 2;
}

// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in the future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
  if (Subtarget.hasExynosCheapAsMoveHandling()) {
    if (isExynosCheapAsMove(MI))
      return true;
    return MI.isAsCheapAsAMove();
  }

  switch (MI.getOpcode()) {
  default:
    return MI.isAsCheapAsAMove();

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::SUBWrs:
  case AArch64::SUBXrs:
    return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;

  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
  // ORRXri, it is as cheap as MOV.
  // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
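  // For MOVi64imm, e.g. (illustrative): 0x0000ffff0000ffff is a single
  // ORR-immediate and 0x000012340000ffff is MOVZ+MOVK, so both count as cheap;
  // 0x123456789abcdef0 needs MOVZ plus three MOVKs and does not.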
919 case AArch64::MOVi32imm: 920 return isCheapImmediate(MI, 32); 921 case AArch64::MOVi64imm: 922 return isCheapImmediate(MI, 64); 923 } 924 } 925 926 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) { 927 switch (MI.getOpcode()) { 928 default: 929 return false; 930 931 case AArch64::ADDWrs: 932 case AArch64::ADDXrs: 933 case AArch64::ADDSWrs: 934 case AArch64::ADDSXrs: { 935 unsigned Imm = MI.getOperand(3).getImm(); 936 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 937 if (ShiftVal == 0) 938 return true; 939 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5; 940 } 941 942 case AArch64::ADDWrx: 943 case AArch64::ADDXrx: 944 case AArch64::ADDXrx64: 945 case AArch64::ADDSWrx: 946 case AArch64::ADDSXrx: 947 case AArch64::ADDSXrx64: { 948 unsigned Imm = MI.getOperand(3).getImm(); 949 switch (AArch64_AM::getArithExtendType(Imm)) { 950 default: 951 return false; 952 case AArch64_AM::UXTB: 953 case AArch64_AM::UXTH: 954 case AArch64_AM::UXTW: 955 case AArch64_AM::UXTX: 956 return AArch64_AM::getArithShiftValue(Imm) <= 4; 957 } 958 } 959 960 case AArch64::SUBWrs: 961 case AArch64::SUBSWrs: { 962 unsigned Imm = MI.getOperand(3).getImm(); 963 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 964 return ShiftVal == 0 || 965 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31); 966 } 967 968 case AArch64::SUBXrs: 969 case AArch64::SUBSXrs: { 970 unsigned Imm = MI.getOperand(3).getImm(); 971 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 972 return ShiftVal == 0 || 973 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63); 974 } 975 976 case AArch64::SUBWrx: 977 case AArch64::SUBXrx: 978 case AArch64::SUBXrx64: 979 case AArch64::SUBSWrx: 980 case AArch64::SUBSXrx: 981 case AArch64::SUBSXrx64: { 982 unsigned Imm = MI.getOperand(3).getImm(); 983 switch (AArch64_AM::getArithExtendType(Imm)) { 984 default: 985 return false; 986 case AArch64_AM::UXTB: 987 case AArch64_AM::UXTH: 988 case AArch64_AM::UXTW: 989 case AArch64_AM::UXTX: 990 return AArch64_AM::getArithShiftValue(Imm) == 0; 991 } 992 } 993 994 case AArch64::LDRBBroW: 995 case AArch64::LDRBBroX: 996 case AArch64::LDRBroW: 997 case AArch64::LDRBroX: 998 case AArch64::LDRDroW: 999 case AArch64::LDRDroX: 1000 case AArch64::LDRHHroW: 1001 case AArch64::LDRHHroX: 1002 case AArch64::LDRHroW: 1003 case AArch64::LDRHroX: 1004 case AArch64::LDRQroW: 1005 case AArch64::LDRQroX: 1006 case AArch64::LDRSBWroW: 1007 case AArch64::LDRSBWroX: 1008 case AArch64::LDRSBXroW: 1009 case AArch64::LDRSBXroX: 1010 case AArch64::LDRSHWroW: 1011 case AArch64::LDRSHWroX: 1012 case AArch64::LDRSHXroW: 1013 case AArch64::LDRSHXroX: 1014 case AArch64::LDRSWroW: 1015 case AArch64::LDRSWroX: 1016 case AArch64::LDRSroW: 1017 case AArch64::LDRSroX: 1018 case AArch64::LDRWroW: 1019 case AArch64::LDRWroX: 1020 case AArch64::LDRXroW: 1021 case AArch64::LDRXroX: 1022 case AArch64::PRFMroW: 1023 case AArch64::PRFMroX: 1024 case AArch64::STRBBroW: 1025 case AArch64::STRBBroX: 1026 case AArch64::STRBroW: 1027 case AArch64::STRBroX: 1028 case AArch64::STRDroW: 1029 case AArch64::STRDroX: 1030 case AArch64::STRHHroW: 1031 case AArch64::STRHHroX: 1032 case AArch64::STRHroW: 1033 case AArch64::STRHroX: 1034 case AArch64::STRQroW: 1035 case AArch64::STRQroX: 1036 case AArch64::STRSroW: 1037 case AArch64::STRSroX: 1038 case AArch64::STRWroW: 1039 case AArch64::STRWroX: 1040 case AArch64::STRXroW: 1041 case AArch64::STRXroX: { 1042 unsigned IsSigned = MI.getOperand(3).getImm(); 1043 return !IsSigned; 1044 } 
1045 } 1046 } 1047 1048 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) { 1049 unsigned Opc = MI.getOpcode(); 1050 switch (Opc) { 1051 default: 1052 return false; 1053 case AArch64::SEH_StackAlloc: 1054 case AArch64::SEH_SaveFPLR: 1055 case AArch64::SEH_SaveFPLR_X: 1056 case AArch64::SEH_SaveReg: 1057 case AArch64::SEH_SaveReg_X: 1058 case AArch64::SEH_SaveRegP: 1059 case AArch64::SEH_SaveRegP_X: 1060 case AArch64::SEH_SaveFReg: 1061 case AArch64::SEH_SaveFReg_X: 1062 case AArch64::SEH_SaveFRegP: 1063 case AArch64::SEH_SaveFRegP_X: 1064 case AArch64::SEH_SetFP: 1065 case AArch64::SEH_AddFP: 1066 case AArch64::SEH_Nop: 1067 case AArch64::SEH_PrologEnd: 1068 case AArch64::SEH_EpilogStart: 1069 case AArch64::SEH_EpilogEnd: 1070 case AArch64::SEH_PACSignLR: 1071 return true; 1072 } 1073 } 1074 1075 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, 1076 Register &SrcReg, Register &DstReg, 1077 unsigned &SubIdx) const { 1078 switch (MI.getOpcode()) { 1079 default: 1080 return false; 1081 case AArch64::SBFMXri: // aka sxtw 1082 case AArch64::UBFMXri: // aka uxtw 1083 // Check for the 32 -> 64 bit extension case, these instructions can do 1084 // much more. 1085 if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31) 1086 return false; 1087 // This is a signed or unsigned 32 -> 64 bit extension. 1088 SrcReg = MI.getOperand(1).getReg(); 1089 DstReg = MI.getOperand(0).getReg(); 1090 SubIdx = AArch64::sub_32; 1091 return true; 1092 } 1093 } 1094 1095 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint( 1096 const MachineInstr &MIa, const MachineInstr &MIb) const { 1097 const TargetRegisterInfo *TRI = &getRegisterInfo(); 1098 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr; 1099 int64_t OffsetA = 0, OffsetB = 0; 1100 TypeSize WidthA(0, false), WidthB(0, false); 1101 bool OffsetAIsScalable = false, OffsetBIsScalable = false; 1102 1103 assert(MIa.mayLoadOrStore() && "MIa must be a load or store."); 1104 assert(MIb.mayLoadOrStore() && "MIb must be a load or store."); 1105 1106 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() || 1107 MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) 1108 return false; 1109 1110 // Retrieve the base, offset from the base and width. Width 1111 // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If 1112 // base are identical, and the offset of a lower memory access + 1113 // the width doesn't overlap the offset of a higher memory access, 1114 // then the memory accesses are different. 1115 // If OffsetAIsScalable and OffsetBIsScalable are both true, they 1116 // are assumed to have the same scale (vscale). 1117 if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable, 1118 WidthA, TRI) && 1119 getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable, 1120 WidthB, TRI)) { 1121 if (BaseOpA->isIdenticalTo(*BaseOpB) && 1122 OffsetAIsScalable == OffsetBIsScalable) { 1123 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; 1124 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; 1125 TypeSize LowWidth = (LowOffset == OffsetA) ? 
WidthA : WidthB; 1126 if (LowWidth.isScalable() == OffsetAIsScalable && 1127 LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset) 1128 return true; 1129 } 1130 } 1131 return false; 1132 } 1133 1134 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI, 1135 const MachineBasicBlock *MBB, 1136 const MachineFunction &MF) const { 1137 if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF)) 1138 return true; 1139 1140 // Do not move an instruction that can be recognized as a branch target. 1141 if (hasBTISemantics(MI)) 1142 return true; 1143 1144 switch (MI.getOpcode()) { 1145 case AArch64::HINT: 1146 // CSDB hints are scheduling barriers. 1147 if (MI.getOperand(0).getImm() == 0x14) 1148 return true; 1149 break; 1150 case AArch64::DSB: 1151 case AArch64::ISB: 1152 // DSB and ISB also are scheduling barriers. 1153 return true; 1154 case AArch64::MSRpstatesvcrImm1: 1155 // SMSTART and SMSTOP are also scheduling barriers. 1156 return true; 1157 default:; 1158 } 1159 if (isSEHInstruction(MI)) 1160 return true; 1161 auto Next = std::next(MI.getIterator()); 1162 return Next != MBB->end() && Next->isCFIInstruction(); 1163 } 1164 1165 /// analyzeCompare - For a comparison instruction, return the source registers 1166 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. 1167 /// Return true if the comparison instruction can be analyzed. 1168 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, 1169 Register &SrcReg2, int64_t &CmpMask, 1170 int64_t &CmpValue) const { 1171 // The first operand can be a frame index where we'd normally expect a 1172 // register. 1173 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands"); 1174 if (!MI.getOperand(1).isReg()) 1175 return false; 1176 1177 switch (MI.getOpcode()) { 1178 default: 1179 break; 1180 case AArch64::PTEST_PP: 1181 case AArch64::PTEST_PP_ANY: 1182 SrcReg = MI.getOperand(0).getReg(); 1183 SrcReg2 = MI.getOperand(1).getReg(); 1184 // Not sure about the mask and value for now... 1185 CmpMask = ~0; 1186 CmpValue = 0; 1187 return true; 1188 case AArch64::SUBSWrr: 1189 case AArch64::SUBSWrs: 1190 case AArch64::SUBSWrx: 1191 case AArch64::SUBSXrr: 1192 case AArch64::SUBSXrs: 1193 case AArch64::SUBSXrx: 1194 case AArch64::ADDSWrr: 1195 case AArch64::ADDSWrs: 1196 case AArch64::ADDSWrx: 1197 case AArch64::ADDSXrr: 1198 case AArch64::ADDSXrs: 1199 case AArch64::ADDSXrx: 1200 // Replace SUBSWrr with SUBWrr if NZCV is not used. 1201 SrcReg = MI.getOperand(1).getReg(); 1202 SrcReg2 = MI.getOperand(2).getReg(); 1203 CmpMask = ~0; 1204 CmpValue = 0; 1205 return true; 1206 case AArch64::SUBSWri: 1207 case AArch64::ADDSWri: 1208 case AArch64::SUBSXri: 1209 case AArch64::ADDSXri: 1210 SrcReg = MI.getOperand(1).getReg(); 1211 SrcReg2 = 0; 1212 CmpMask = ~0; 1213 CmpValue = MI.getOperand(2).getImm(); 1214 return true; 1215 case AArch64::ANDSWri: 1216 case AArch64::ANDSXri: 1217 // ANDS does not use the same encoding scheme as the others xxxS 1218 // instructions. 1219 SrcReg = MI.getOperand(1).getReg(); 1220 SrcReg2 = 0; 1221 CmpMask = ~0; 1222 CmpValue = AArch64_AM::decodeLogicalImmediate( 1223 MI.getOperand(2).getImm(), 1224 MI.getOpcode() == AArch64::ANDSWri ? 
32 : 64); 1225 return true; 1226 } 1227 1228 return false; 1229 } 1230 1231 static bool UpdateOperandRegClass(MachineInstr &Instr) { 1232 MachineBasicBlock *MBB = Instr.getParent(); 1233 assert(MBB && "Can't get MachineBasicBlock here"); 1234 MachineFunction *MF = MBB->getParent(); 1235 assert(MF && "Can't get MachineFunction here"); 1236 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); 1237 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); 1238 MachineRegisterInfo *MRI = &MF->getRegInfo(); 1239 1240 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx; 1241 ++OpIdx) { 1242 MachineOperand &MO = Instr.getOperand(OpIdx); 1243 const TargetRegisterClass *OpRegCstraints = 1244 Instr.getRegClassConstraint(OpIdx, TII, TRI); 1245 1246 // If there's no constraint, there's nothing to do. 1247 if (!OpRegCstraints) 1248 continue; 1249 // If the operand is a frame index, there's nothing to do here. 1250 // A frame index operand will resolve correctly during PEI. 1251 if (MO.isFI()) 1252 continue; 1253 1254 assert(MO.isReg() && 1255 "Operand has register constraints without being a register!"); 1256 1257 Register Reg = MO.getReg(); 1258 if (Reg.isPhysical()) { 1259 if (!OpRegCstraints->contains(Reg)) 1260 return false; 1261 } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) && 1262 !MRI->constrainRegClass(Reg, OpRegCstraints)) 1263 return false; 1264 } 1265 1266 return true; 1267 } 1268 1269 /// Return the opcode that does not set flags when possible - otherwise 1270 /// return the original opcode. The caller is responsible to do the actual 1271 /// substitution and legality checking. 1272 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) { 1273 // Don't convert all compare instructions, because for some the zero register 1274 // encoding becomes the sp register. 1275 bool MIDefinesZeroReg = false; 1276 if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR)) 1277 MIDefinesZeroReg = true; 1278 1279 switch (MI.getOpcode()) { 1280 default: 1281 return MI.getOpcode(); 1282 case AArch64::ADDSWrr: 1283 return AArch64::ADDWrr; 1284 case AArch64::ADDSWri: 1285 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri; 1286 case AArch64::ADDSWrs: 1287 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs; 1288 case AArch64::ADDSWrx: 1289 return AArch64::ADDWrx; 1290 case AArch64::ADDSXrr: 1291 return AArch64::ADDXrr; 1292 case AArch64::ADDSXri: 1293 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri; 1294 case AArch64::ADDSXrs: 1295 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs; 1296 case AArch64::ADDSXrx: 1297 return AArch64::ADDXrx; 1298 case AArch64::SUBSWrr: 1299 return AArch64::SUBWrr; 1300 case AArch64::SUBSWri: 1301 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri; 1302 case AArch64::SUBSWrs: 1303 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs; 1304 case AArch64::SUBSWrx: 1305 return AArch64::SUBWrx; 1306 case AArch64::SUBSXrr: 1307 return AArch64::SUBXrr; 1308 case AArch64::SUBSXri: 1309 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri; 1310 case AArch64::SUBSXrs: 1311 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs; 1312 case AArch64::SUBSXrx: 1313 return AArch64::SUBXrx; 1314 } 1315 } 1316 1317 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 }; 1318 1319 /// True when condition flags are accessed (either by writing or reading) 1320 /// on the instruction trace starting at From and ending at To. 
1321 /// 1322 /// Note: If From and To are from different blocks it's assumed CC are accessed 1323 /// on the path. 1324 static bool areCFlagsAccessedBetweenInstrs( 1325 MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, 1326 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) { 1327 // Early exit if To is at the beginning of the BB. 1328 if (To == To->getParent()->begin()) 1329 return true; 1330 1331 // Check whether the instructions are in the same basic block 1332 // If not, assume the condition flags might get modified somewhere. 1333 if (To->getParent() != From->getParent()) 1334 return true; 1335 1336 // From must be above To. 1337 assert(std::any_of( 1338 ++To.getReverse(), To->getParent()->rend(), 1339 [From](MachineInstr &MI) { return MI.getIterator() == From; })); 1340 1341 // We iterate backward starting at \p To until we hit \p From. 1342 for (const MachineInstr &Instr : 1343 instructionsWithoutDebug(++To.getReverse(), From.getReverse())) { 1344 if (((AccessToCheck & AK_Write) && 1345 Instr.modifiesRegister(AArch64::NZCV, TRI)) || 1346 ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI))) 1347 return true; 1348 } 1349 return false; 1350 } 1351 1352 /// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating 1353 /// operation which could set the flags in an identical manner 1354 bool AArch64InstrInfo::optimizePTestInstr( 1355 MachineInstr *PTest, unsigned MaskReg, unsigned PredReg, 1356 const MachineRegisterInfo *MRI) const { 1357 auto *Mask = MRI->getUniqueVRegDef(MaskReg); 1358 auto *Pred = MRI->getUniqueVRegDef(PredReg); 1359 auto NewOp = Pred->getOpcode(); 1360 bool OpChanged = false; 1361 1362 unsigned MaskOpcode = Mask->getOpcode(); 1363 unsigned PredOpcode = Pred->getOpcode(); 1364 bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode); 1365 bool PredIsWhileLike = isWhileOpcode(PredOpcode); 1366 1367 if (isPTrueOpcode(MaskOpcode) && (PredIsPTestLike || PredIsWhileLike) && 1368 getElementSizeForOpcode(MaskOpcode) == 1369 getElementSizeForOpcode(PredOpcode) && 1370 Mask->getOperand(1).getImm() == 31) { 1371 // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is 1372 // redundant since WHILE performs an implicit PTEST with an all active 1373 // mask. Must be an all active predicate of matching element size. 1374 1375 // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the 1376 // PTEST_LIKE instruction uses the same all active mask and the element 1377 // size matches. If the PTEST has a condition of any then it is always 1378 // redundant. 1379 if (PredIsPTestLike) { 1380 auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); 1381 if (Mask != PTestLikeMask && PTest->getOpcode() != AArch64::PTEST_PP_ANY) 1382 return false; 1383 } 1384 1385 // Fallthough to simply remove the PTEST. 1386 } else if ((Mask == Pred) && (PredIsPTestLike || PredIsWhileLike) && 1387 PTest->getOpcode() == AArch64::PTEST_PP_ANY) { 1388 // For PTEST(PG, PG), PTEST is redundant when PG is the result of an 1389 // instruction that sets the flags as PTEST would. This is only valid when 1390 // the condition is any. 1391 1392 // Fallthough to simply remove the PTEST. 1393 } else if (PredIsPTestLike) { 1394 // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the 1395 // flags are set based on the same mask 'PG', but PTEST_LIKE must operate 1396 // on 8-bit predicates like the PTEST. 
Otherwise, for instructions like 1397 // compare that also support 16/32/64-bit predicates, the implicit PTEST 1398 // performed by the compare could consider fewer lanes for these element 1399 // sizes. 1400 // 1401 // For example, consider 1402 // 1403 // ptrue p0.b ; P0=1111-1111-1111-1111 1404 // index z0.s, #0, #1 ; Z0=<0,1,2,3> 1405 // index z1.s, #1, #1 ; Z1=<1,2,3,4> 1406 // cmphi p1.s, p0/z, z1.s, z0.s ; P1=0001-0001-0001-0001 1407 // ; ^ last active 1408 // ptest p0, p1.b ; P1=0001-0001-0001-0001 1409 // ; ^ last active 1410 // 1411 // where the compare generates a canonical all active 32-bit predicate 1412 // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last 1413 // active flag, whereas the PTEST instruction with the same mask doesn't. 1414 // For PTEST_ANY this doesn't apply as the flags in this case would be 1415 // identical regardless of element size. 1416 auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); 1417 uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode); 1418 if ((Mask != PTestLikeMask) || 1419 (PredElementSize != AArch64::ElementSizeB && 1420 PTest->getOpcode() != AArch64::PTEST_PP_ANY)) 1421 return false; 1422 1423 // Fallthough to simply remove the PTEST. 1424 } else { 1425 // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the 1426 // opcode so the PTEST becomes redundant. 1427 switch (PredOpcode) { 1428 case AArch64::AND_PPzPP: 1429 case AArch64::BIC_PPzPP: 1430 case AArch64::EOR_PPzPP: 1431 case AArch64::NAND_PPzPP: 1432 case AArch64::NOR_PPzPP: 1433 case AArch64::ORN_PPzPP: 1434 case AArch64::ORR_PPzPP: 1435 case AArch64::BRKA_PPzP: 1436 case AArch64::BRKPA_PPzPP: 1437 case AArch64::BRKB_PPzP: 1438 case AArch64::BRKPB_PPzPP: 1439 case AArch64::RDFFR_PPz: { 1440 // Check to see if our mask is the same. If not the resulting flag bits 1441 // may be different and we can't remove the ptest. 1442 auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); 1443 if (Mask != PredMask) 1444 return false; 1445 break; 1446 } 1447 case AArch64::BRKN_PPzP: { 1448 // BRKN uses an all active implicit mask to set flags unlike the other 1449 // flag-setting instructions. 1450 // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B). 1451 if ((MaskOpcode != AArch64::PTRUE_B) || 1452 (Mask->getOperand(1).getImm() != 31)) 1453 return false; 1454 break; 1455 } 1456 case AArch64::PTRUE_B: 1457 // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A) 1458 break; 1459 default: 1460 // Bail out if we don't recognize the input 1461 return false; 1462 } 1463 1464 NewOp = convertToFlagSettingOpc(PredOpcode); 1465 OpChanged = true; 1466 } 1467 1468 const TargetRegisterInfo *TRI = &getRegisterInfo(); 1469 1470 // If another instruction between Pred and PTest accesses flags, don't remove 1471 // the ptest or update the earlier instruction to modify them. 1472 if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI)) 1473 return false; 1474 1475 // If we pass all the checks, it's safe to remove the PTEST and use the flags 1476 // as they are prior to PTEST. Sometimes this requires the tested PTEST 1477 // operand to be replaced with an equivalent instruction that also sets the 1478 // flags. 1479 Pred->setDesc(get(NewOp)); 1480 PTest->eraseFromParent(); 1481 if (OpChanged) { 1482 bool succeeded = UpdateOperandRegClass(*Pred); 1483 (void)succeeded; 1484 assert(succeeded && "Operands have incompatible register classes!"); 1485 Pred->addRegisterDefined(AArch64::NZCV, TRI); 1486 } 1487 1488 // Ensure that the flags def is live. 
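  // With the PTEST gone, the NZCV consumers now read the flags produced by
  // Pred, so a def that was previously marked dead (it used to be clobbered by
  // the PTEST before any use) must be marked live again.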
  if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
    unsigned i = 0, e = Pred->getNumOperands();
    for (; i != e; ++i) {
      MachineOperand &MO = Pred->getOperand(i);
      if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
        MO.setIsDead(false);
        break;
      }
    }
  }
  return true;
}

/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It is truly a compare
/// instruction only when there are no uses of its destination register.
///
/// The following steps are tried in order:
/// 1. Convert CmpInstr into an unconditional (non-flag-setting) version.
/// 2. Remove CmpInstr if above there is an instruction producing a needed
///    condition code or an instruction which can be converted into such an
///    instruction.
///    Only comparison with zero is supported.
bool AArch64InstrInfo::optimizeCompareInstr(
    MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
    int64_t CmpValue, const MachineRegisterInfo *MRI) const {
  assert(CmpInstr.getParent());
  assert(MRI);

  // Replace SUBSWrr with SUBWrr if NZCV is not used.
  int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
  if (DeadNZCVIdx != -1) {
    if (CmpInstr.definesRegister(AArch64::WZR) ||
        CmpInstr.definesRegister(AArch64::XZR)) {
      CmpInstr.eraseFromParent();
      return true;
    }
    unsigned Opc = CmpInstr.getOpcode();
    unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
    if (NewOpc == Opc)
      return false;
    const MCInstrDesc &MCID = get(NewOpc);
    CmpInstr.setDesc(MCID);
    CmpInstr.removeOperand(DeadNZCVIdx);
    bool succeeded = UpdateOperandRegClass(CmpInstr);
    (void)succeeded;
    assert(succeeded && "Some operands reg class are incompatible!");
    return true;
  }

  if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
      CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY)
    return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);

  if (SrcReg2 != 0)
    return false;

  // CmpInstr is a compare instruction if the destination register is not used.
  if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
    return false;

  if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
    return true;
  return (CmpValue == 0 || CmpValue == 1) &&
         removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
}

/// Get the opcode of the S (flag-setting) version of Instr.
/// If Instr already is the S version, its opcode is returned.
/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
/// version or we are not interested in it.
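/// For example, ADDWri maps to ADDSWri and ANDXri to ANDSXri; opcodes without
/// a flag-setting form handled here map to AArch64::INSTRUCTION_LIST_END.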
1561 static unsigned sForm(MachineInstr &Instr) { 1562 switch (Instr.getOpcode()) { 1563 default: 1564 return AArch64::INSTRUCTION_LIST_END; 1565 1566 case AArch64::ADDSWrr: 1567 case AArch64::ADDSWri: 1568 case AArch64::ADDSXrr: 1569 case AArch64::ADDSXri: 1570 case AArch64::SUBSWrr: 1571 case AArch64::SUBSWri: 1572 case AArch64::SUBSXrr: 1573 case AArch64::SUBSXri: 1574 return Instr.getOpcode(); 1575 1576 case AArch64::ADDWrr: 1577 return AArch64::ADDSWrr; 1578 case AArch64::ADDWri: 1579 return AArch64::ADDSWri; 1580 case AArch64::ADDXrr: 1581 return AArch64::ADDSXrr; 1582 case AArch64::ADDXri: 1583 return AArch64::ADDSXri; 1584 case AArch64::ADCWr: 1585 return AArch64::ADCSWr; 1586 case AArch64::ADCXr: 1587 return AArch64::ADCSXr; 1588 case AArch64::SUBWrr: 1589 return AArch64::SUBSWrr; 1590 case AArch64::SUBWri: 1591 return AArch64::SUBSWri; 1592 case AArch64::SUBXrr: 1593 return AArch64::SUBSXrr; 1594 case AArch64::SUBXri: 1595 return AArch64::SUBSXri; 1596 case AArch64::SBCWr: 1597 return AArch64::SBCSWr; 1598 case AArch64::SBCXr: 1599 return AArch64::SBCSXr; 1600 case AArch64::ANDWri: 1601 return AArch64::ANDSWri; 1602 case AArch64::ANDXri: 1603 return AArch64::ANDSXri; 1604 } 1605 } 1606 1607 /// Check if AArch64::NZCV should be alive in successors of MBB. 1608 static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) { 1609 for (auto *BB : MBB->successors()) 1610 if (BB->isLiveIn(AArch64::NZCV)) 1611 return true; 1612 return false; 1613 } 1614 1615 /// \returns The condition code operand index for \p Instr if it is a branch 1616 /// or select and -1 otherwise. 1617 static int 1618 findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) { 1619 switch (Instr.getOpcode()) { 1620 default: 1621 return -1; 1622 1623 case AArch64::Bcc: { 1624 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); 1625 assert(Idx >= 2); 1626 return Idx - 2; 1627 } 1628 1629 case AArch64::CSINVWr: 1630 case AArch64::CSINVXr: 1631 case AArch64::CSINCWr: 1632 case AArch64::CSINCXr: 1633 case AArch64::CSELWr: 1634 case AArch64::CSELXr: 1635 case AArch64::CSNEGWr: 1636 case AArch64::CSNEGXr: 1637 case AArch64::FCSELSrrr: 1638 case AArch64::FCSELDrrr: { 1639 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); 1640 assert(Idx >= 1); 1641 return Idx - 1; 1642 } 1643 } 1644 } 1645 1646 /// Find a condition code used by the instruction. 1647 /// Returns AArch64CC::Invalid if either the instruction does not use condition 1648 /// codes or we don't optimize CmpInstr in the presence of such instructions. 1649 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) { 1650 int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr); 1651 return CCIdx >= 0 ? 
static_cast<AArch64CC::CondCode>(
                          Instr.getOperand(CCIdx).getImm())
                    : AArch64CC::Invalid;
}

static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
  assert(CC != AArch64CC::Invalid);
  UsedNZCV UsedFlags;
  switch (CC) {
  default:
    break;

  case AArch64CC::EQ: // Z set
  case AArch64CC::NE: // Z clear
    UsedFlags.Z = true;
    break;

  case AArch64CC::HI: // Z clear and C set
  case AArch64CC::LS: // Z set or C clear
    UsedFlags.Z = true;
    [[fallthrough]];
  case AArch64CC::HS: // C set
  case AArch64CC::LO: // C clear
    UsedFlags.C = true;
    break;

  case AArch64CC::MI: // N set
  case AArch64CC::PL: // N clear
    UsedFlags.N = true;
    break;

  case AArch64CC::VS: // V set
  case AArch64CC::VC: // V clear
    UsedFlags.V = true;
    break;

  case AArch64CC::GT: // Z clear, N and V the same
  case AArch64CC::LE: // Z set, N and V differ
    UsedFlags.Z = true;
    [[fallthrough]];
  case AArch64CC::GE: // N and V the same
  case AArch64CC::LT: // N and V differ
    UsedFlags.N = true;
    UsedFlags.V = true;
    break;
  }
  return UsedFlags;
}

/// \returns Condition flags used after \p CmpInstr in its MachineBB if NZCV
/// flags are not alive in successors of the basic block containing both
/// \p CmpInstr and \p MI.
/// \returns std::nullopt otherwise.
///
/// Collect instructions using those flags in \p CCUseInstrs if provided.
std::optional<UsedNZCV>
llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
                       const TargetRegisterInfo &TRI,
                       SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
  MachineBasicBlock *CmpParent = CmpInstr.getParent();
  if (MI.getParent() != CmpParent)
    return std::nullopt;

  if (areCFlagsAliveInSuccessors(CmpParent))
    return std::nullopt;

  UsedNZCV NZCVUsedAfterCmp;
  for (MachineInstr &Instr : instructionsWithoutDebug(
           std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
    if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
      AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
      if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
        return std::nullopt;
      NZCVUsedAfterCmp |= getUsedNZCV(CC);
      if (CCUseInstrs)
        CCUseInstrs->push_back(&Instr);
    }
    if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
      break;
  }
  return NZCVUsedAfterCmp;
}

static bool isADDSRegImm(unsigned Opcode) {
  return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
}

static bool isSUBSRegImm(unsigned Opcode) {
  return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
}

/// Check if CmpInstr can be substituted by MI.
///
/// CmpInstr can be substituted:
/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
/// - and, MI and CmpInstr are from the same MachineBB
/// - and, condition flags are not alive in successors of the CmpInstr parent
/// - and, if MI opcode is the S form there must be no defs of flags between
///        MI and CmpInstr
///        or if MI opcode is not the S form there must be neither defs of flags
///        nor uses of flags between MI and CmpInstr.
/// - and, if C/V flags are not used after CmpInstr
///        or if N flag is used but MI produces poison value if signed overflow
///        occurs.
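///
/// For example (illustrative):
/// \code
///   add  w0, w1, w2
///   subs wzr, w0, #0   // cmp w0, #0
///   b.eq <target>
/// \endcode
/// Here the compare only feeds a Z-flag user, so the ADD can be rewritten to
/// ADDS and the SUBS removed.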
static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
                                       const TargetRegisterInfo &TRI) {
  // NOTE: This assertion guarantees that MI.getOpcode() is an add or a
  // subtraction that may or may not set flags.
  assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);

  const unsigned CmpOpcode = CmpInstr.getOpcode();
  if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
    return false;

  assert((CmpInstr.getOperand(2).isImm() &&
          CmpInstr.getOperand(2).getImm() == 0) &&
         "Caller guarantees that CmpInstr compares with constant 0");

  std::optional<UsedNZCV> NZCVUsed = examineCFlagsUse(MI, CmpInstr, TRI);
  if (!NZCVUsed || NZCVUsed->C)
    return false;

  // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
  // '%vreg = add ...' or '%vreg = sub ...'.
  // Condition flag V is used to indicate signed overflow.
  // 1) MI and CmpInstr set N and V to the same value.
  // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
  //    signed overflow occurs, so CmpInstr could still be simplified away.
  if (NZCVUsed->V && !MI.getFlag(MachineInstr::NoSWrap))
    return false;

  AccessKind AccessToCheck = AK_Write;
  if (sForm(MI) != MI.getOpcode())
    AccessToCheck = AK_All;
  return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
}

/// Substitute an instruction comparing to zero with another instruction
/// which produces the needed condition flags.
///
/// Return true on success.
bool AArch64InstrInfo::substituteCmpToZero(
    MachineInstr &CmpInstr, unsigned SrcReg,
    const MachineRegisterInfo &MRI) const {
  // Get the unique definition of SrcReg.
  MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
  if (!MI)
    return false;

  const TargetRegisterInfo &TRI = getRegisterInfo();

  unsigned NewOpc = sForm(*MI);
  if (NewOpc == AArch64::INSTRUCTION_LIST_END)
    return false;

  if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
    return false;

  // Update the instruction to set NZCV.
  MI->setDesc(get(NewOpc));
  CmpInstr.eraseFromParent();
  bool Succeeded = UpdateOperandRegClass(*MI);
  (void)Succeeded;
  assert(Succeeded && "Some operand register classes are incompatible!");
  MI->addRegisterDefined(AArch64::NZCV, &TRI);
  return true;
}

/// \returns True if \p CmpInstr can be removed.
///
/// \p IsInvertCC is true if, after removing \p CmpInstr, condition
/// codes used in \p CCUseInstrs must be inverted.
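///
/// Illustrative sketch of the inverted case: for
/// \code
///   csinc w9, wzr, wzr, eq    // w9 = eq ? 0 : 1
///   cmp   w9, #1              // CmpInstr, CmpValue == 1
///   b.eq  ...                 // taken iff w9 == 1, i.e. iff eq was false
/// \endcode
/// the compare can only be removed if the b.eq is rewritten to b.ne, so
/// \p IsInvertCC is set to true.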
1822 static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr, 1823 int CmpValue, const TargetRegisterInfo &TRI, 1824 SmallVectorImpl<MachineInstr *> &CCUseInstrs, 1825 bool &IsInvertCC) { 1826 assert((CmpValue == 0 || CmpValue == 1) && 1827 "Only comparisons to 0 or 1 considered for removal!"); 1828 1829 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>' 1830 unsigned MIOpc = MI.getOpcode(); 1831 if (MIOpc == AArch64::CSINCWr) { 1832 if (MI.getOperand(1).getReg() != AArch64::WZR || 1833 MI.getOperand(2).getReg() != AArch64::WZR) 1834 return false; 1835 } else if (MIOpc == AArch64::CSINCXr) { 1836 if (MI.getOperand(1).getReg() != AArch64::XZR || 1837 MI.getOperand(2).getReg() != AArch64::XZR) 1838 return false; 1839 } else { 1840 return false; 1841 } 1842 AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI); 1843 if (MICC == AArch64CC::Invalid) 1844 return false; 1845 1846 // NZCV needs to be defined 1847 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) 1848 return false; 1849 1850 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1' 1851 const unsigned CmpOpcode = CmpInstr.getOpcode(); 1852 bool IsSubsRegImm = isSUBSRegImm(CmpOpcode); 1853 if (CmpValue && !IsSubsRegImm) 1854 return false; 1855 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode)) 1856 return false; 1857 1858 // MI conditions allowed: eq, ne, mi, pl 1859 UsedNZCV MIUsedNZCV = getUsedNZCV(MICC); 1860 if (MIUsedNZCV.C || MIUsedNZCV.V) 1861 return false; 1862 1863 std::optional<UsedNZCV> NZCVUsedAfterCmp = 1864 examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs); 1865 // Condition flags are not used in CmpInstr basic block successors and only 1866 // Z or N flags allowed to be used after CmpInstr within its basic block 1867 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V) 1868 return false; 1869 // Z or N flag used after CmpInstr must correspond to the flag used in MI 1870 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) || 1871 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z)) 1872 return false; 1873 // If CmpInstr is comparison to zero MI conditions are limited to eq, ne 1874 if (MIUsedNZCV.N && !CmpValue) 1875 return false; 1876 1877 // There must be no defs of flags between MI and CmpInstr 1878 if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write)) 1879 return false; 1880 1881 // Condition code is inverted in the following cases: 1882 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' 1883 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1' 1884 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) || 1885 (!CmpValue && MICC == AArch64CC::NE); 1886 return true; 1887 } 1888 1889 /// Remove comparison in csinc-cmp sequence 1890 /// 1891 /// Examples: 1892 /// 1. \code 1893 /// csinc w9, wzr, wzr, ne 1894 /// cmp w9, #0 1895 /// b.eq 1896 /// \endcode 1897 /// to 1898 /// \code 1899 /// csinc w9, wzr, wzr, ne 1900 /// b.ne 1901 /// \endcode 1902 /// 1903 /// 2. 
\code 1904 /// csinc x2, xzr, xzr, mi 1905 /// cmp x2, #1 1906 /// b.pl 1907 /// \endcode 1908 /// to 1909 /// \code 1910 /// csinc x2, xzr, xzr, mi 1911 /// b.pl 1912 /// \endcode 1913 /// 1914 /// \param CmpInstr comparison instruction 1915 /// \return True when comparison removed 1916 bool AArch64InstrInfo::removeCmpToZeroOrOne( 1917 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue, 1918 const MachineRegisterInfo &MRI) const { 1919 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg); 1920 if (!MI) 1921 return false; 1922 const TargetRegisterInfo &TRI = getRegisterInfo(); 1923 SmallVector<MachineInstr *, 4> CCUseInstrs; 1924 bool IsInvertCC = false; 1925 if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs, 1926 IsInvertCC)) 1927 return false; 1928 // Make transformation 1929 CmpInstr.eraseFromParent(); 1930 if (IsInvertCC) { 1931 // Invert condition codes in CmpInstr CC users 1932 for (MachineInstr *CCUseInstr : CCUseInstrs) { 1933 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr); 1934 assert(Idx >= 0 && "Unexpected instruction using CC."); 1935 MachineOperand &CCOperand = CCUseInstr->getOperand(Idx); 1936 AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode( 1937 static_cast<AArch64CC::CondCode>(CCOperand.getImm())); 1938 CCOperand.setImm(CCUse); 1939 } 1940 } 1941 return true; 1942 } 1943 1944 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { 1945 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD && 1946 MI.getOpcode() != AArch64::CATCHRET) 1947 return false; 1948 1949 MachineBasicBlock &MBB = *MI.getParent(); 1950 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>(); 1951 auto TRI = Subtarget.getRegisterInfo(); 1952 DebugLoc DL = MI.getDebugLoc(); 1953 1954 if (MI.getOpcode() == AArch64::CATCHRET) { 1955 // Skip to the first instruction before the epilog. 
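    // The ADRP + ADDXri pair emitted below materializes the address of the
    // catchret target block in X0; it must be placed before any
    // FrameDestroy-flagged (epilogue/SEH) instructions at the end of the
    // block, so walk backwards over them first.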
    const TargetInstrInfo *TII =
        MBB.getParent()->getSubtarget().getInstrInfo();
    MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
    auto MBBI = MachineBasicBlock::iterator(MI);
    MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
    while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
           FirstEpilogSEH != MBB.begin())
      FirstEpilogSEH = std::prev(FirstEpilogSEH);
    if (FirstEpilogSEH != MBB.begin())
      FirstEpilogSEH = std::next(FirstEpilogSEH);
    BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
        .addReg(AArch64::X0, RegState::Define)
        .addMBB(TargetMBB);
    BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
        .addReg(AArch64::X0, RegState::Define)
        .addReg(AArch64::X0)
        .addMBB(TargetMBB)
        .addImm(0);
    return true;
  }

  Register Reg = MI.getOperand(0).getReg();
  Module &M = *MBB.getParent()->getFunction().getParent();
  if (M.getStackProtectorGuard() == "sysreg") {
    const AArch64SysReg::SysReg *SrcReg =
        AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
    if (!SrcReg)
      report_fatal_error("Unknown SysReg for Stack Protector Guard Register");

    // mrs xN, sysreg
    BuildMI(MBB, MI, DL, get(AArch64::MRS))
        .addDef(Reg, RegState::Renamable)
        .addImm(SrcReg->Encoding);
    int Offset = M.getStackProtectorGuardOffset();
    if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
      // ldr xN, [xN, #offset]
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(Offset / 8);
    } else if (Offset >= -256 && Offset <= 255) {
      // ldur xN, [xN, #offset]
      BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(Offset);
    } else if (Offset >= -4095 && Offset <= 4095) {
      if (Offset > 0) {
        // add xN, xN, #offset
        BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
            .addDef(Reg)
            .addUse(Reg, RegState::Kill)
            .addImm(Offset)
            .addImm(0);
      } else {
        // sub xN, xN, #offset
        BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
            .addDef(Reg)
            .addUse(Reg, RegState::Kill)
            .addImm(-Offset)
            .addImm(0);
      }
      // ldr xN, [xN]
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(0);
    } else {
      // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
      // than 32760.
      // It might be nice to use AArch64::MOVi32imm here, which would get
      // expanded in PreSched2 after PostRA, but our lone scratch Reg already
      // contains the MRS result. findScratchNonCalleeSaveRegister() in
      // AArch64FrameLowering might help us find such a scratch register
      // though. If we failed to find a scratch register, we could emit a
      // stream of add instructions to build up the immediate. Or, we could try
      // to insert a AArch64::MOVi32imm before register allocation so that we
      // didn't need to scavenge for a scratch register.
2034 report_fatal_error("Unable to encode Stack Protector Guard Offset"); 2035 } 2036 MBB.erase(MI); 2037 return true; 2038 } 2039 2040 const GlobalValue *GV = 2041 cast<GlobalValue>((*MI.memoperands_begin())->getValue()); 2042 const TargetMachine &TM = MBB.getParent()->getTarget(); 2043 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); 2044 const unsigned char MO_NC = AArch64II::MO_NC; 2045 2046 if ((OpFlags & AArch64II::MO_GOT) != 0) { 2047 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg) 2048 .addGlobalAddress(GV, 0, OpFlags); 2049 if (Subtarget.isTargetILP32()) { 2050 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 2051 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 2052 .addDef(Reg32, RegState::Dead) 2053 .addUse(Reg, RegState::Kill) 2054 .addImm(0) 2055 .addMemOperand(*MI.memoperands_begin()) 2056 .addDef(Reg, RegState::Implicit); 2057 } else { 2058 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 2059 .addReg(Reg, RegState::Kill) 2060 .addImm(0) 2061 .addMemOperand(*MI.memoperands_begin()); 2062 } 2063 } else if (TM.getCodeModel() == CodeModel::Large) { 2064 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?"); 2065 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) 2066 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC) 2067 .addImm(0); 2068 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2069 .addReg(Reg, RegState::Kill) 2070 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC) 2071 .addImm(16); 2072 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2073 .addReg(Reg, RegState::Kill) 2074 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC) 2075 .addImm(32); 2076 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2077 .addReg(Reg, RegState::Kill) 2078 .addGlobalAddress(GV, 0, AArch64II::MO_G3) 2079 .addImm(48); 2080 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 2081 .addReg(Reg, RegState::Kill) 2082 .addImm(0) 2083 .addMemOperand(*MI.memoperands_begin()); 2084 } else if (TM.getCodeModel() == CodeModel::Tiny) { 2085 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg) 2086 .addGlobalAddress(GV, 0, OpFlags); 2087 } else { 2088 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg) 2089 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); 2090 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC; 2091 if (Subtarget.isTargetILP32()) { 2092 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 2093 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 2094 .addDef(Reg32, RegState::Dead) 2095 .addUse(Reg, RegState::Kill) 2096 .addGlobalAddress(GV, 0, LoFlags) 2097 .addMemOperand(*MI.memoperands_begin()) 2098 .addDef(Reg, RegState::Implicit); 2099 } else { 2100 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 2101 .addReg(Reg, RegState::Kill) 2102 .addGlobalAddress(GV, 0, LoFlags) 2103 .addMemOperand(*MI.memoperands_begin()); 2104 } 2105 } 2106 2107 MBB.erase(MI); 2108 2109 return true; 2110 } 2111 2112 // Return true if this instruction simply sets its single destination register 2113 // to zero. This is equivalent to a register rename of the zero-register. 
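// Examples (matching the cases handled below): 'movz w0, #0',
// 'and w0, wzr, #imm', or a COPY whose source is WZR.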
bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
    if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
      assert(MI.getDesc().getNumOperands() == 3 &&
             MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
      return true;
    }
    break;
  case AArch64::ANDWri: // and Rd, Rzr, #imm
    return MI.getOperand(1).getReg() == AArch64::WZR;
  case AArch64::ANDXri:
    return MI.getOperand(1).getReg() == AArch64::XZR;
  case TargetOpcode::COPY:
    return MI.getOperand(1).getReg() == AArch64::WZR;
  }
  return false;
}

// Return true if this instruction simply renames a general register without
// modifying bits.
bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    break;
  case TargetOpcode::COPY: {
    // GPR32 copies will be lowered to ORRXrs
    Register DstReg = MI.getOperand(0).getReg();
    return (AArch64::GPR32RegClass.contains(DstReg) ||
            AArch64::GPR64RegClass.contains(DstReg));
  }
  case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
    if (MI.getOperand(1).getReg() == AArch64::XZR) {
      assert(MI.getDesc().getNumOperands() == 4 &&
             MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
      return true;
    }
    break;
  case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
    if (MI.getOperand(2).getImm() == 0) {
      assert(MI.getDesc().getNumOperands() == 4 &&
             MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
      return true;
    }
    break;
  }
  return false;
}

// Return true if this instruction simply renames a floating-point/SIMD
// register without modifying bits.
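// Examples (matching the cases handled below): a COPY with an FPR128
// destination, or 'orr v0.16b, v1.16b, v1.16b' (i.e. 'mov v0.16b, v1.16b').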
2168 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) { 2169 switch (MI.getOpcode()) { 2170 default: 2171 break; 2172 case TargetOpcode::COPY: { 2173 Register DstReg = MI.getOperand(0).getReg(); 2174 return AArch64::FPR128RegClass.contains(DstReg); 2175 } 2176 case AArch64::ORRv16i8: 2177 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) { 2178 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() && 2179 "invalid ORRv16i8 operands"); 2180 return true; 2181 } 2182 break; 2183 } 2184 return false; 2185 } 2186 2187 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, 2188 int &FrameIndex) const { 2189 switch (MI.getOpcode()) { 2190 default: 2191 break; 2192 case AArch64::LDRWui: 2193 case AArch64::LDRXui: 2194 case AArch64::LDRBui: 2195 case AArch64::LDRHui: 2196 case AArch64::LDRSui: 2197 case AArch64::LDRDui: 2198 case AArch64::LDRQui: 2199 case AArch64::LDR_PXI: 2200 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && 2201 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { 2202 FrameIndex = MI.getOperand(1).getIndex(); 2203 return MI.getOperand(0).getReg(); 2204 } 2205 break; 2206 } 2207 2208 return 0; 2209 } 2210 2211 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI, 2212 int &FrameIndex) const { 2213 switch (MI.getOpcode()) { 2214 default: 2215 break; 2216 case AArch64::STRWui: 2217 case AArch64::STRXui: 2218 case AArch64::STRBui: 2219 case AArch64::STRHui: 2220 case AArch64::STRSui: 2221 case AArch64::STRDui: 2222 case AArch64::STRQui: 2223 case AArch64::STR_PXI: 2224 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && 2225 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { 2226 FrameIndex = MI.getOperand(1).getIndex(); 2227 return MI.getOperand(0).getReg(); 2228 } 2229 break; 2230 } 2231 return 0; 2232 } 2233 2234 /// Check all MachineMemOperands for a hint to suppress pairing. 2235 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) { 2236 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 2237 return MMO->getFlags() & MOSuppressPair; 2238 }); 2239 } 2240 2241 /// Set a flag on the first MachineMemOperand to suppress pairing. 2242 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) { 2243 if (MI.memoperands_empty()) 2244 return; 2245 (*MI.memoperands_begin())->setFlags(MOSuppressPair); 2246 } 2247 2248 /// Check all MachineMemOperands for a hint that the load/store is strided. 
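/// This mirrors isLdStPairSuppressed() above, but tests the MOStridedAccess
/// flag instead of MOSuppressPair.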
2249 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) { 2250 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 2251 return MMO->getFlags() & MOStridedAccess; 2252 }); 2253 } 2254 2255 bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) { 2256 switch (Opc) { 2257 default: 2258 return false; 2259 case AArch64::STURSi: 2260 case AArch64::STRSpre: 2261 case AArch64::STURDi: 2262 case AArch64::STRDpre: 2263 case AArch64::STURQi: 2264 case AArch64::STRQpre: 2265 case AArch64::STURBBi: 2266 case AArch64::STURHHi: 2267 case AArch64::STURWi: 2268 case AArch64::STRWpre: 2269 case AArch64::STURXi: 2270 case AArch64::STRXpre: 2271 case AArch64::LDURSi: 2272 case AArch64::LDRSpre: 2273 case AArch64::LDURDi: 2274 case AArch64::LDRDpre: 2275 case AArch64::LDURQi: 2276 case AArch64::LDRQpre: 2277 case AArch64::LDURWi: 2278 case AArch64::LDRWpre: 2279 case AArch64::LDURXi: 2280 case AArch64::LDRXpre: 2281 case AArch64::LDRSWpre: 2282 case AArch64::LDURSWi: 2283 case AArch64::LDURHHi: 2284 case AArch64::LDURBBi: 2285 case AArch64::LDURSBWi: 2286 case AArch64::LDURSHWi: 2287 return true; 2288 } 2289 } 2290 2291 std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) { 2292 switch (Opc) { 2293 default: return {}; 2294 case AArch64::PRFMui: return AArch64::PRFUMi; 2295 case AArch64::LDRXui: return AArch64::LDURXi; 2296 case AArch64::LDRWui: return AArch64::LDURWi; 2297 case AArch64::LDRBui: return AArch64::LDURBi; 2298 case AArch64::LDRHui: return AArch64::LDURHi; 2299 case AArch64::LDRSui: return AArch64::LDURSi; 2300 case AArch64::LDRDui: return AArch64::LDURDi; 2301 case AArch64::LDRQui: return AArch64::LDURQi; 2302 case AArch64::LDRBBui: return AArch64::LDURBBi; 2303 case AArch64::LDRHHui: return AArch64::LDURHHi; 2304 case AArch64::LDRSBXui: return AArch64::LDURSBXi; 2305 case AArch64::LDRSBWui: return AArch64::LDURSBWi; 2306 case AArch64::LDRSHXui: return AArch64::LDURSHXi; 2307 case AArch64::LDRSHWui: return AArch64::LDURSHWi; 2308 case AArch64::LDRSWui: return AArch64::LDURSWi; 2309 case AArch64::STRXui: return AArch64::STURXi; 2310 case AArch64::STRWui: return AArch64::STURWi; 2311 case AArch64::STRBui: return AArch64::STURBi; 2312 case AArch64::STRHui: return AArch64::STURHi; 2313 case AArch64::STRSui: return AArch64::STURSi; 2314 case AArch64::STRDui: return AArch64::STURDi; 2315 case AArch64::STRQui: return AArch64::STURQi; 2316 case AArch64::STRBBui: return AArch64::STURBBi; 2317 case AArch64::STRHHui: return AArch64::STURHHi; 2318 } 2319 } 2320 2321 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { 2322 switch (Opc) { 2323 default: 2324 return 2; 2325 case AArch64::LDPXi: 2326 case AArch64::LDPDi: 2327 case AArch64::STPXi: 2328 case AArch64::STPDi: 2329 case AArch64::LDNPXi: 2330 case AArch64::LDNPDi: 2331 case AArch64::STNPXi: 2332 case AArch64::STNPDi: 2333 case AArch64::LDPQi: 2334 case AArch64::STPQi: 2335 case AArch64::LDNPQi: 2336 case AArch64::STNPQi: 2337 case AArch64::LDPWi: 2338 case AArch64::LDPSi: 2339 case AArch64::STPWi: 2340 case AArch64::STPSi: 2341 case AArch64::LDNPWi: 2342 case AArch64::LDNPSi: 2343 case AArch64::STNPWi: 2344 case AArch64::STNPSi: 2345 case AArch64::LDG: 2346 case AArch64::STGPi: 2347 2348 case AArch64::LD1B_IMM: 2349 case AArch64::LD1B_H_IMM: 2350 case AArch64::LD1B_S_IMM: 2351 case AArch64::LD1B_D_IMM: 2352 case AArch64::LD1SB_H_IMM: 2353 case AArch64::LD1SB_S_IMM: 2354 case AArch64::LD1SB_D_IMM: 2355 case AArch64::LD1H_IMM: 2356 case AArch64::LD1H_S_IMM: 2357 case AArch64::LD1H_D_IMM: 2358 
case AArch64::LD1SH_S_IMM: 2359 case AArch64::LD1SH_D_IMM: 2360 case AArch64::LD1W_IMM: 2361 case AArch64::LD1W_D_IMM: 2362 case AArch64::LD1SW_D_IMM: 2363 case AArch64::LD1D_IMM: 2364 2365 case AArch64::LD2B_IMM: 2366 case AArch64::LD2H_IMM: 2367 case AArch64::LD2W_IMM: 2368 case AArch64::LD2D_IMM: 2369 case AArch64::LD3B_IMM: 2370 case AArch64::LD3H_IMM: 2371 case AArch64::LD3W_IMM: 2372 case AArch64::LD3D_IMM: 2373 case AArch64::LD4B_IMM: 2374 case AArch64::LD4H_IMM: 2375 case AArch64::LD4W_IMM: 2376 case AArch64::LD4D_IMM: 2377 2378 case AArch64::ST1B_IMM: 2379 case AArch64::ST1B_H_IMM: 2380 case AArch64::ST1B_S_IMM: 2381 case AArch64::ST1B_D_IMM: 2382 case AArch64::ST1H_IMM: 2383 case AArch64::ST1H_S_IMM: 2384 case AArch64::ST1H_D_IMM: 2385 case AArch64::ST1W_IMM: 2386 case AArch64::ST1W_D_IMM: 2387 case AArch64::ST1D_IMM: 2388 2389 case AArch64::ST2B_IMM: 2390 case AArch64::ST2H_IMM: 2391 case AArch64::ST2W_IMM: 2392 case AArch64::ST2D_IMM: 2393 case AArch64::ST3B_IMM: 2394 case AArch64::ST3H_IMM: 2395 case AArch64::ST3W_IMM: 2396 case AArch64::ST3D_IMM: 2397 case AArch64::ST4B_IMM: 2398 case AArch64::ST4H_IMM: 2399 case AArch64::ST4W_IMM: 2400 case AArch64::ST4D_IMM: 2401 2402 case AArch64::LD1RB_IMM: 2403 case AArch64::LD1RB_H_IMM: 2404 case AArch64::LD1RB_S_IMM: 2405 case AArch64::LD1RB_D_IMM: 2406 case AArch64::LD1RSB_H_IMM: 2407 case AArch64::LD1RSB_S_IMM: 2408 case AArch64::LD1RSB_D_IMM: 2409 case AArch64::LD1RH_IMM: 2410 case AArch64::LD1RH_S_IMM: 2411 case AArch64::LD1RH_D_IMM: 2412 case AArch64::LD1RSH_S_IMM: 2413 case AArch64::LD1RSH_D_IMM: 2414 case AArch64::LD1RW_IMM: 2415 case AArch64::LD1RW_D_IMM: 2416 case AArch64::LD1RSW_IMM: 2417 case AArch64::LD1RD_IMM: 2418 2419 case AArch64::LDNT1B_ZRI: 2420 case AArch64::LDNT1H_ZRI: 2421 case AArch64::LDNT1W_ZRI: 2422 case AArch64::LDNT1D_ZRI: 2423 case AArch64::STNT1B_ZRI: 2424 case AArch64::STNT1H_ZRI: 2425 case AArch64::STNT1W_ZRI: 2426 case AArch64::STNT1D_ZRI: 2427 2428 case AArch64::LDNF1B_IMM: 2429 case AArch64::LDNF1B_H_IMM: 2430 case AArch64::LDNF1B_S_IMM: 2431 case AArch64::LDNF1B_D_IMM: 2432 case AArch64::LDNF1SB_H_IMM: 2433 case AArch64::LDNF1SB_S_IMM: 2434 case AArch64::LDNF1SB_D_IMM: 2435 case AArch64::LDNF1H_IMM: 2436 case AArch64::LDNF1H_S_IMM: 2437 case AArch64::LDNF1H_D_IMM: 2438 case AArch64::LDNF1SH_S_IMM: 2439 case AArch64::LDNF1SH_D_IMM: 2440 case AArch64::LDNF1W_IMM: 2441 case AArch64::LDNF1W_D_IMM: 2442 case AArch64::LDNF1SW_D_IMM: 2443 case AArch64::LDNF1D_IMM: 2444 return 3; 2445 case AArch64::ADDG: 2446 case AArch64::STGi: 2447 case AArch64::LDR_PXI: 2448 case AArch64::STR_PXI: 2449 return 2; 2450 } 2451 } 2452 2453 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) { 2454 switch (MI.getOpcode()) { 2455 default: 2456 return false; 2457 // Scaled instructions. 2458 case AArch64::STRSui: 2459 case AArch64::STRDui: 2460 case AArch64::STRQui: 2461 case AArch64::STRXui: 2462 case AArch64::STRWui: 2463 case AArch64::LDRSui: 2464 case AArch64::LDRDui: 2465 case AArch64::LDRQui: 2466 case AArch64::LDRXui: 2467 case AArch64::LDRWui: 2468 case AArch64::LDRSWui: 2469 // Unscaled instructions. 
2470 case AArch64::STURSi: 2471 case AArch64::STRSpre: 2472 case AArch64::STURDi: 2473 case AArch64::STRDpre: 2474 case AArch64::STURQi: 2475 case AArch64::STRQpre: 2476 case AArch64::STURWi: 2477 case AArch64::STRWpre: 2478 case AArch64::STURXi: 2479 case AArch64::STRXpre: 2480 case AArch64::LDURSi: 2481 case AArch64::LDRSpre: 2482 case AArch64::LDURDi: 2483 case AArch64::LDRDpre: 2484 case AArch64::LDURQi: 2485 case AArch64::LDRQpre: 2486 case AArch64::LDURWi: 2487 case AArch64::LDRWpre: 2488 case AArch64::LDURXi: 2489 case AArch64::LDRXpre: 2490 case AArch64::LDURSWi: 2491 case AArch64::LDRSWpre: 2492 return true; 2493 } 2494 } 2495 2496 bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) { 2497 switch (MI.getOpcode()) { 2498 default: 2499 assert((!MI.isCall() || !MI.isReturn()) && 2500 "Unexpected instruction - was a new tail call opcode introduced?"); 2501 return false; 2502 case AArch64::TCRETURNdi: 2503 case AArch64::TCRETURNri: 2504 case AArch64::TCRETURNriBTI: 2505 case AArch64::TCRETURNriALL: 2506 return true; 2507 } 2508 } 2509 2510 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) { 2511 switch (Opc) { 2512 default: 2513 llvm_unreachable("Opcode has no flag setting equivalent!"); 2514 // 32-bit cases: 2515 case AArch64::ADDWri: 2516 return AArch64::ADDSWri; 2517 case AArch64::ADDWrr: 2518 return AArch64::ADDSWrr; 2519 case AArch64::ADDWrs: 2520 return AArch64::ADDSWrs; 2521 case AArch64::ADDWrx: 2522 return AArch64::ADDSWrx; 2523 case AArch64::ANDWri: 2524 return AArch64::ANDSWri; 2525 case AArch64::ANDWrr: 2526 return AArch64::ANDSWrr; 2527 case AArch64::ANDWrs: 2528 return AArch64::ANDSWrs; 2529 case AArch64::BICWrr: 2530 return AArch64::BICSWrr; 2531 case AArch64::BICWrs: 2532 return AArch64::BICSWrs; 2533 case AArch64::SUBWri: 2534 return AArch64::SUBSWri; 2535 case AArch64::SUBWrr: 2536 return AArch64::SUBSWrr; 2537 case AArch64::SUBWrs: 2538 return AArch64::SUBSWrs; 2539 case AArch64::SUBWrx: 2540 return AArch64::SUBSWrx; 2541 // 64-bit cases: 2542 case AArch64::ADDXri: 2543 return AArch64::ADDSXri; 2544 case AArch64::ADDXrr: 2545 return AArch64::ADDSXrr; 2546 case AArch64::ADDXrs: 2547 return AArch64::ADDSXrs; 2548 case AArch64::ADDXrx: 2549 return AArch64::ADDSXrx; 2550 case AArch64::ANDXri: 2551 return AArch64::ANDSXri; 2552 case AArch64::ANDXrr: 2553 return AArch64::ANDSXrr; 2554 case AArch64::ANDXrs: 2555 return AArch64::ANDSXrs; 2556 case AArch64::BICXrr: 2557 return AArch64::BICSXrr; 2558 case AArch64::BICXrs: 2559 return AArch64::BICSXrs; 2560 case AArch64::SUBXri: 2561 return AArch64::SUBSXri; 2562 case AArch64::SUBXrr: 2563 return AArch64::SUBSXrr; 2564 case AArch64::SUBXrs: 2565 return AArch64::SUBSXrs; 2566 case AArch64::SUBXrx: 2567 return AArch64::SUBSXrx; 2568 // SVE instructions: 2569 case AArch64::AND_PPzPP: 2570 return AArch64::ANDS_PPzPP; 2571 case AArch64::BIC_PPzPP: 2572 return AArch64::BICS_PPzPP; 2573 case AArch64::EOR_PPzPP: 2574 return AArch64::EORS_PPzPP; 2575 case AArch64::NAND_PPzPP: 2576 return AArch64::NANDS_PPzPP; 2577 case AArch64::NOR_PPzPP: 2578 return AArch64::NORS_PPzPP; 2579 case AArch64::ORN_PPzPP: 2580 return AArch64::ORNS_PPzPP; 2581 case AArch64::ORR_PPzPP: 2582 return AArch64::ORRS_PPzPP; 2583 case AArch64::BRKA_PPzP: 2584 return AArch64::BRKAS_PPzP; 2585 case AArch64::BRKPA_PPzPP: 2586 return AArch64::BRKPAS_PPzPP; 2587 case AArch64::BRKB_PPzP: 2588 return AArch64::BRKBS_PPzP; 2589 case AArch64::BRKPB_PPzPP: 2590 return AArch64::BRKPBS_PPzPP; 2591 case AArch64::BRKN_PPzP: 2592 return 
AArch64::BRKNS_PPzP; 2593 case AArch64::RDFFR_PPz: 2594 return AArch64::RDFFRS_PPz; 2595 case AArch64::PTRUE_B: 2596 return AArch64::PTRUES_B; 2597 } 2598 } 2599 2600 // Is this a candidate for ld/st merging or pairing? For example, we don't 2601 // touch volatiles or load/stores that have a hint to avoid pair formation. 2602 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const { 2603 2604 bool IsPreLdSt = isPreLdSt(MI); 2605 2606 // If this is a volatile load/store, don't mess with it. 2607 if (MI.hasOrderedMemoryRef()) 2608 return false; 2609 2610 // Make sure this is a reg/fi+imm (as opposed to an address reloc). 2611 // For Pre-inc LD/ST, the operand is shifted by one. 2612 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() || 2613 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) && 2614 "Expected a reg or frame index operand."); 2615 2616 // For Pre-indexed addressing quadword instructions, the third operand is the 2617 // immediate value. 2618 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm(); 2619 2620 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt) 2621 return false; 2622 2623 // Can't merge/pair if the instruction modifies the base register. 2624 // e.g., ldr x0, [x0] 2625 // This case will never occur with an FI base. 2626 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or 2627 // STR<S,D,Q,W,X>pre, it can be merged. 2628 // For example: 2629 // ldr q0, [x11, #32]! 2630 // ldr q1, [x11, #16] 2631 // to 2632 // ldp q0, q1, [x11, #32]! 2633 if (MI.getOperand(1).isReg() && !IsPreLdSt) { 2634 Register BaseReg = MI.getOperand(1).getReg(); 2635 const TargetRegisterInfo *TRI = &getRegisterInfo(); 2636 if (MI.modifiesRegister(BaseReg, TRI)) 2637 return false; 2638 } 2639 2640 // Check if this load/store has a hint to avoid pair formation. 2641 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. 2642 if (isLdStPairSuppressed(MI)) 2643 return false; 2644 2645 // Do not pair any callee-save store/reload instructions in the 2646 // prologue/epilogue if the CFI information encoded the operations as separate 2647 // instructions, as that will cause the size of the actual prologue to mismatch 2648 // with the prologue size recorded in the Windows CFI. 2649 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo(); 2650 bool NeedsWinCFI = MAI->usesWindowsCFI() && 2651 MI.getMF()->getFunction().needsUnwindTableEntry(); 2652 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) || 2653 MI.getFlag(MachineInstr::FrameDestroy))) 2654 return false; 2655 2656 // On some CPUs quad load/store pairs are slower than two single load/stores. 2657 if (Subtarget.isPaired128Slow()) { 2658 switch (MI.getOpcode()) { 2659 default: 2660 break; 2661 case AArch64::LDURQi: 2662 case AArch64::STURQi: 2663 case AArch64::LDRQui: 2664 case AArch64::STRQui: 2665 return false; 2666 } 2667 } 2668 2669 return true; 2670 } 2671 2672 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth( 2673 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, 2674 int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, 2675 const TargetRegisterInfo *TRI) const { 2676 if (!LdSt.mayLoadOrStore()) 2677 return false; 2678 2679 const MachineOperand *BaseOp; 2680 TypeSize WidthN(0, false); 2681 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable, 2682 WidthN, TRI)) 2683 return false; 2684 // The maximum vscale is 16 under AArch64, return the maximal extent for the 2685 // vector. 2686 Width = WidthN.isScalable() 2687 ? 
WidthN.getKnownMinValue() * AArch64::SVEMaxBitsPerVector / 2688 AArch64::SVEBitsPerBlock 2689 : WidthN.getKnownMinValue(); 2690 BaseOps.push_back(BaseOp); 2691 return true; 2692 } 2693 2694 std::optional<ExtAddrMode> 2695 AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI, 2696 const TargetRegisterInfo *TRI) const { 2697 const MachineOperand *Base; // Filled with the base operand of MI. 2698 int64_t Offset; // Filled with the offset of MI. 2699 bool OffsetIsScalable; 2700 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI)) 2701 return std::nullopt; 2702 2703 if (!Base->isReg()) 2704 return std::nullopt; 2705 ExtAddrMode AM; 2706 AM.BaseReg = Base->getReg(); 2707 AM.Displacement = Offset; 2708 AM.ScaledReg = 0; 2709 AM.Scale = 0; 2710 return AM; 2711 } 2712 2713 bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI, 2714 Register Reg, 2715 const MachineInstr &AddrI, 2716 ExtAddrMode &AM) const { 2717 // Filter out instructions into which we cannot fold. 2718 unsigned NumBytes; 2719 int64_t OffsetScale = 1; 2720 switch (MemI.getOpcode()) { 2721 default: 2722 return false; 2723 2724 case AArch64::LDURQi: 2725 case AArch64::STURQi: 2726 NumBytes = 16; 2727 break; 2728 2729 case AArch64::LDURDi: 2730 case AArch64::STURDi: 2731 case AArch64::LDURXi: 2732 case AArch64::STURXi: 2733 NumBytes = 8; 2734 break; 2735 2736 case AArch64::LDURWi: 2737 case AArch64::LDURSWi: 2738 case AArch64::STURWi: 2739 NumBytes = 4; 2740 break; 2741 2742 case AArch64::LDURHi: 2743 case AArch64::STURHi: 2744 case AArch64::LDURHHi: 2745 case AArch64::STURHHi: 2746 case AArch64::LDURSHXi: 2747 case AArch64::LDURSHWi: 2748 NumBytes = 2; 2749 break; 2750 2751 case AArch64::LDRBroX: 2752 case AArch64::LDRBBroX: 2753 case AArch64::LDRSBXroX: 2754 case AArch64::LDRSBWroX: 2755 case AArch64::STRBroX: 2756 case AArch64::STRBBroX: 2757 case AArch64::LDURBi: 2758 case AArch64::LDURBBi: 2759 case AArch64::LDURSBXi: 2760 case AArch64::LDURSBWi: 2761 case AArch64::STURBi: 2762 case AArch64::STURBBi: 2763 case AArch64::LDRBui: 2764 case AArch64::LDRBBui: 2765 case AArch64::LDRSBXui: 2766 case AArch64::LDRSBWui: 2767 case AArch64::STRBui: 2768 case AArch64::STRBBui: 2769 NumBytes = 1; 2770 break; 2771 2772 case AArch64::LDRQroX: 2773 case AArch64::STRQroX: 2774 case AArch64::LDRQui: 2775 case AArch64::STRQui: 2776 NumBytes = 16; 2777 OffsetScale = 16; 2778 break; 2779 2780 case AArch64::LDRDroX: 2781 case AArch64::STRDroX: 2782 case AArch64::LDRXroX: 2783 case AArch64::STRXroX: 2784 case AArch64::LDRDui: 2785 case AArch64::STRDui: 2786 case AArch64::LDRXui: 2787 case AArch64::STRXui: 2788 NumBytes = 8; 2789 OffsetScale = 8; 2790 break; 2791 2792 case AArch64::LDRWroX: 2793 case AArch64::LDRSWroX: 2794 case AArch64::STRWroX: 2795 case AArch64::LDRWui: 2796 case AArch64::LDRSWui: 2797 case AArch64::STRWui: 2798 NumBytes = 4; 2799 OffsetScale = 4; 2800 break; 2801 2802 case AArch64::LDRHroX: 2803 case AArch64::STRHroX: 2804 case AArch64::LDRHHroX: 2805 case AArch64::STRHHroX: 2806 case AArch64::LDRSHXroX: 2807 case AArch64::LDRSHWroX: 2808 case AArch64::LDRHui: 2809 case AArch64::STRHui: 2810 case AArch64::LDRHHui: 2811 case AArch64::STRHHui: 2812 case AArch64::LDRSHXui: 2813 case AArch64::LDRSHWui: 2814 NumBytes = 2; 2815 OffsetScale = 2; 2816 break; 2817 } 2818 2819 // Check the fold operand is not the loaded/stored value. 
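  // For example (illustrative): in 'add x1, x0, #8; str x1, [x2]' the add
  // defines the stored value rather than the address, so there is nothing to
  // fold into the store's addressing mode.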
2820 const MachineOperand &BaseRegOp = MemI.getOperand(0); 2821 if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg) 2822 return false; 2823 2824 // Handle memory instructions with a [Reg, Reg] addressing mode. 2825 if (MemI.getOperand(2).isReg()) { 2826 // Bail if the addressing mode already includes extension of the offset 2827 // register. 2828 if (MemI.getOperand(3).getImm()) 2829 return false; 2830 2831 // Check if we actually have a scaled offset. 2832 if (MemI.getOperand(4).getImm() == 0) 2833 OffsetScale = 1; 2834 2835 // If the address instructions is folded into the base register, then the 2836 // addressing mode must not have a scale. Then we can swap the base and the 2837 // scaled registers. 2838 if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1) 2839 return false; 2840 2841 switch (AddrI.getOpcode()) { 2842 default: 2843 return false; 2844 2845 case AArch64::SBFMXri: 2846 // sxtw Xa, Wm 2847 // ldr Xd, [Xn, Xa, lsl #N] 2848 // -> 2849 // ldr Xd, [Xn, Wm, sxtw #N] 2850 if (AddrI.getOperand(2).getImm() != 0 || 2851 AddrI.getOperand(3).getImm() != 31) 2852 return false; 2853 2854 AM.BaseReg = MemI.getOperand(1).getReg(); 2855 if (AM.BaseReg == Reg) 2856 AM.BaseReg = MemI.getOperand(2).getReg(); 2857 AM.ScaledReg = AddrI.getOperand(1).getReg(); 2858 AM.Scale = OffsetScale; 2859 AM.Displacement = 0; 2860 AM.Form = ExtAddrMode::Formula::SExtScaledReg; 2861 return true; 2862 2863 case TargetOpcode::SUBREG_TO_REG: { 2864 // mov Wa, Wm 2865 // ldr Xd, [Xn, Xa, lsl #N] 2866 // -> 2867 // ldr Xd, [Xn, Wm, uxtw #N] 2868 2869 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG. 2870 if (AddrI.getOperand(1).getImm() != 0 || 2871 AddrI.getOperand(3).getImm() != AArch64::sub_32) 2872 return false; 2873 2874 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo(); 2875 Register OffsetReg = AddrI.getOperand(2).getReg(); 2876 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg)) 2877 return false; 2878 2879 const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg); 2880 if (DefMI.getOpcode() != AArch64::ORRWrs || 2881 DefMI.getOperand(1).getReg() != AArch64::WZR || 2882 DefMI.getOperand(3).getImm() != 0) 2883 return false; 2884 2885 AM.BaseReg = MemI.getOperand(1).getReg(); 2886 if (AM.BaseReg == Reg) 2887 AM.BaseReg = MemI.getOperand(2).getReg(); 2888 AM.ScaledReg = DefMI.getOperand(2).getReg(); 2889 AM.Scale = OffsetScale; 2890 AM.Displacement = 0; 2891 AM.Form = ExtAddrMode::Formula::ZExtScaledReg; 2892 return true; 2893 } 2894 } 2895 } 2896 2897 // Handle memory instructions with a [Reg, #Imm] addressing mode. 2898 2899 // Check we are not breaking a potential conversion to an LDP. 
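  // LDP/STP immediates are 7-bit signed values scaled by the access size, so
  // the ranges checked below are [-64, 63] * NumBytes.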
2900 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset, 2901 int64_t NewOffset) -> bool { 2902 int64_t MinOffset, MaxOffset; 2903 switch (NumBytes) { 2904 default: 2905 return true; 2906 case 4: 2907 MinOffset = -256; 2908 MaxOffset = 252; 2909 break; 2910 case 8: 2911 MinOffset = -512; 2912 MaxOffset = 504; 2913 break; 2914 case 16: 2915 MinOffset = -1024; 2916 MaxOffset = 1008; 2917 break; 2918 } 2919 return OldOffset < MinOffset || OldOffset > MaxOffset || 2920 (NewOffset >= MinOffset && NewOffset <= MaxOffset); 2921 }; 2922 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool { 2923 int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale; 2924 int64_t NewOffset = OldOffset + Disp; 2925 if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0)) 2926 return false; 2927 // If the old offset would fit into an LDP, but the new offset wouldn't, 2928 // bail out. 2929 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset)) 2930 return false; 2931 AM.BaseReg = AddrI.getOperand(1).getReg(); 2932 AM.ScaledReg = 0; 2933 AM.Scale = 0; 2934 AM.Displacement = NewOffset; 2935 AM.Form = ExtAddrMode::Formula::Basic; 2936 return true; 2937 }; 2938 2939 auto canFoldAddRegIntoAddrMode = 2940 [&](int64_t Scale, 2941 ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool { 2942 if (MemI.getOperand(2).getImm() != 0) 2943 return false; 2944 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale)) 2945 return false; 2946 AM.BaseReg = AddrI.getOperand(1).getReg(); 2947 AM.ScaledReg = AddrI.getOperand(2).getReg(); 2948 AM.Scale = Scale; 2949 AM.Displacement = 0; 2950 AM.Form = Form; 2951 return true; 2952 }; 2953 2954 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) { 2955 unsigned Opcode = MemI.getOpcode(); 2956 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) && 2957 Subtarget.isSTRQroSlow(); 2958 }; 2959 2960 int64_t Disp = 0; 2961 const bool OptSize = MemI.getMF()->getFunction().hasOptSize(); 2962 switch (AddrI.getOpcode()) { 2963 default: 2964 return false; 2965 2966 case AArch64::ADDXri: 2967 // add Xa, Xn, #N 2968 // ldr Xd, [Xa, #M] 2969 // -> 2970 // ldr Xd, [Xn, #N'+M] 2971 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm(); 2972 return canFoldAddSubImmIntoAddrMode(Disp); 2973 2974 case AArch64::SUBXri: 2975 // sub Xa, Xn, #N 2976 // ldr Xd, [Xa, #M] 2977 // -> 2978 // ldr Xd, [Xn, #N'+M] 2979 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm(); 2980 return canFoldAddSubImmIntoAddrMode(-Disp); 2981 2982 case AArch64::ADDXrs: { 2983 // add Xa, Xn, Xm, lsl #N 2984 // ldr Xd, [Xa] 2985 // -> 2986 // ldr Xd, [Xn, Xm, lsl #N] 2987 2988 // Don't fold the add if the result would be slower, unless optimising for 2989 // size. 2990 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm()); 2991 if (AArch64_AM::getShiftType(Shift) != AArch64_AM::ShiftExtendType::LSL) 2992 return false; 2993 Shift = AArch64_AM::getShiftValue(Shift); 2994 if (!OptSize) { 2995 if ((Shift != 2 && Shift != 3) || !Subtarget.hasAddrLSLFast()) 2996 return false; 2997 if (avoidSlowSTRQ(MemI)) 2998 return false; 2999 } 3000 return canFoldAddRegIntoAddrMode(1ULL << Shift); 3001 } 3002 3003 case AArch64::ADDXrr: 3004 // add Xa, Xn, Xm 3005 // ldr Xd, [Xa] 3006 // -> 3007 // ldr Xd, [Xn, Xm, lsl #0] 3008 3009 // Don't fold the add if the result would be slower, unless optimising for 3010 // size. 
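    // With an LSL #0 register offset there is no hasAddrLSLFast concern, so
    // only the slow-STRQ check applies here.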
3011 if (!OptSize && avoidSlowSTRQ(MemI)) 3012 return false; 3013 return canFoldAddRegIntoAddrMode(1); 3014 3015 case AArch64::ADDXrx: 3016 // add Xa, Xn, Wm, {s,u}xtw #N 3017 // ldr Xd, [Xa] 3018 // -> 3019 // ldr Xd, [Xn, Wm, {s,u}xtw #N] 3020 3021 // Don't fold the add if the result would be slower, unless optimising for 3022 // size. 3023 if (!OptSize && avoidSlowSTRQ(MemI)) 3024 return false; 3025 3026 // Can fold only sign-/zero-extend of a word. 3027 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm()); 3028 AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm); 3029 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW) 3030 return false; 3031 3032 return canFoldAddRegIntoAddrMode( 3033 1ULL << AArch64_AM::getArithShiftValue(Imm), 3034 (Extend == AArch64_AM::SXTW) ? ExtAddrMode::Formula::SExtScaledReg 3035 : ExtAddrMode::Formula::ZExtScaledReg); 3036 } 3037 } 3038 3039 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, 3040 // return the opcode of an instruction performing the same operation, but using 3041 // the [Reg, Reg] addressing mode. 3042 static unsigned regOffsetOpcode(unsigned Opcode) { 3043 switch (Opcode) { 3044 default: 3045 llvm_unreachable("Address folding not implemented for instruction"); 3046 3047 case AArch64::LDURQi: 3048 case AArch64::LDRQui: 3049 return AArch64::LDRQroX; 3050 case AArch64::STURQi: 3051 case AArch64::STRQui: 3052 return AArch64::STRQroX; 3053 case AArch64::LDURDi: 3054 case AArch64::LDRDui: 3055 return AArch64::LDRDroX; 3056 case AArch64::STURDi: 3057 case AArch64::STRDui: 3058 return AArch64::STRDroX; 3059 case AArch64::LDURXi: 3060 case AArch64::LDRXui: 3061 return AArch64::LDRXroX; 3062 case AArch64::STURXi: 3063 case AArch64::STRXui: 3064 return AArch64::STRXroX; 3065 case AArch64::LDURWi: 3066 case AArch64::LDRWui: 3067 return AArch64::LDRWroX; 3068 case AArch64::LDURSWi: 3069 case AArch64::LDRSWui: 3070 return AArch64::LDRSWroX; 3071 case AArch64::STURWi: 3072 case AArch64::STRWui: 3073 return AArch64::STRWroX; 3074 case AArch64::LDURHi: 3075 case AArch64::LDRHui: 3076 return AArch64::LDRHroX; 3077 case AArch64::STURHi: 3078 case AArch64::STRHui: 3079 return AArch64::STRHroX; 3080 case AArch64::LDURHHi: 3081 case AArch64::LDRHHui: 3082 return AArch64::LDRHHroX; 3083 case AArch64::STURHHi: 3084 case AArch64::STRHHui: 3085 return AArch64::STRHHroX; 3086 case AArch64::LDURSHXi: 3087 case AArch64::LDRSHXui: 3088 return AArch64::LDRSHXroX; 3089 case AArch64::LDURSHWi: 3090 case AArch64::LDRSHWui: 3091 return AArch64::LDRSHWroX; 3092 case AArch64::LDURBi: 3093 case AArch64::LDRBui: 3094 return AArch64::LDRBroX; 3095 case AArch64::LDURBBi: 3096 case AArch64::LDRBBui: 3097 return AArch64::LDRBBroX; 3098 case AArch64::LDURSBXi: 3099 case AArch64::LDRSBXui: 3100 return AArch64::LDRSBXroX; 3101 case AArch64::LDURSBWi: 3102 case AArch64::LDRSBWui: 3103 return AArch64::LDRSBWroX; 3104 case AArch64::STURBi: 3105 case AArch64::STRBui: 3106 return AArch64::STRBroX; 3107 case AArch64::STURBBi: 3108 case AArch64::STRBBui: 3109 return AArch64::STRBBroX; 3110 } 3111 } 3112 3113 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return 3114 // the opcode of an instruction performing the same operation, but using the 3115 // [Reg, #Imm] addressing mode with scaled offset. 
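// For example, LDURXi maps to LDRXui with Scale set to 8; opcodes that are
// already scaled are returned unchanged, with their scale reported.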
3116 unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) { 3117 switch (Opcode) { 3118 default: 3119 llvm_unreachable("Address folding not implemented for instruction"); 3120 3121 case AArch64::LDURQi: 3122 Scale = 16; 3123 return AArch64::LDRQui; 3124 case AArch64::STURQi: 3125 Scale = 16; 3126 return AArch64::STRQui; 3127 case AArch64::LDURDi: 3128 Scale = 8; 3129 return AArch64::LDRDui; 3130 case AArch64::STURDi: 3131 Scale = 8; 3132 return AArch64::STRDui; 3133 case AArch64::LDURXi: 3134 Scale = 8; 3135 return AArch64::LDRXui; 3136 case AArch64::STURXi: 3137 Scale = 8; 3138 return AArch64::STRXui; 3139 case AArch64::LDURWi: 3140 Scale = 4; 3141 return AArch64::LDRWui; 3142 case AArch64::LDURSWi: 3143 Scale = 4; 3144 return AArch64::LDRSWui; 3145 case AArch64::STURWi: 3146 Scale = 4; 3147 return AArch64::STRWui; 3148 case AArch64::LDURHi: 3149 Scale = 2; 3150 return AArch64::LDRHui; 3151 case AArch64::STURHi: 3152 Scale = 2; 3153 return AArch64::STRHui; 3154 case AArch64::LDURHHi: 3155 Scale = 2; 3156 return AArch64::LDRHHui; 3157 case AArch64::STURHHi: 3158 Scale = 2; 3159 return AArch64::STRHHui; 3160 case AArch64::LDURSHXi: 3161 Scale = 2; 3162 return AArch64::LDRSHXui; 3163 case AArch64::LDURSHWi: 3164 Scale = 2; 3165 return AArch64::LDRSHWui; 3166 case AArch64::LDURBi: 3167 Scale = 1; 3168 return AArch64::LDRBui; 3169 case AArch64::LDURBBi: 3170 Scale = 1; 3171 return AArch64::LDRBBui; 3172 case AArch64::LDURSBXi: 3173 Scale = 1; 3174 return AArch64::LDRSBXui; 3175 case AArch64::LDURSBWi: 3176 Scale = 1; 3177 return AArch64::LDRSBWui; 3178 case AArch64::STURBi: 3179 Scale = 1; 3180 return AArch64::STRBui; 3181 case AArch64::STURBBi: 3182 Scale = 1; 3183 return AArch64::STRBBui; 3184 case AArch64::LDRQui: 3185 case AArch64::STRQui: 3186 Scale = 16; 3187 return Opcode; 3188 case AArch64::LDRDui: 3189 case AArch64::STRDui: 3190 case AArch64::LDRXui: 3191 case AArch64::STRXui: 3192 Scale = 8; 3193 return Opcode; 3194 case AArch64::LDRWui: 3195 case AArch64::LDRSWui: 3196 case AArch64::STRWui: 3197 Scale = 4; 3198 return Opcode; 3199 case AArch64::LDRHui: 3200 case AArch64::STRHui: 3201 case AArch64::LDRHHui: 3202 case AArch64::STRHHui: 3203 case AArch64::LDRSHXui: 3204 case AArch64::LDRSHWui: 3205 Scale = 2; 3206 return Opcode; 3207 case AArch64::LDRBui: 3208 case AArch64::LDRBBui: 3209 case AArch64::LDRSBXui: 3210 case AArch64::LDRSBWui: 3211 case AArch64::STRBui: 3212 case AArch64::STRBBui: 3213 Scale = 1; 3214 return Opcode; 3215 } 3216 } 3217 3218 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return 3219 // the opcode of an instruction performing the same operation, but using the 3220 // [Reg, #Imm] addressing mode with unscaled offset. 
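// For example, LDRXui maps to LDURXi; opcodes that are already unscaled are
// returned unchanged.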
3221 unsigned unscaledOffsetOpcode(unsigned Opcode) { 3222 switch (Opcode) { 3223 default: 3224 llvm_unreachable("Address folding not implemented for instruction"); 3225 3226 case AArch64::LDURQi: 3227 case AArch64::STURQi: 3228 case AArch64::LDURDi: 3229 case AArch64::STURDi: 3230 case AArch64::LDURXi: 3231 case AArch64::STURXi: 3232 case AArch64::LDURWi: 3233 case AArch64::LDURSWi: 3234 case AArch64::STURWi: 3235 case AArch64::LDURHi: 3236 case AArch64::STURHi: 3237 case AArch64::LDURHHi: 3238 case AArch64::STURHHi: 3239 case AArch64::LDURSHXi: 3240 case AArch64::LDURSHWi: 3241 case AArch64::LDURBi: 3242 case AArch64::STURBi: 3243 case AArch64::LDURBBi: 3244 case AArch64::STURBBi: 3245 case AArch64::LDURSBWi: 3246 case AArch64::LDURSBXi: 3247 return Opcode; 3248 case AArch64::LDRQui: 3249 return AArch64::LDURQi; 3250 case AArch64::STRQui: 3251 return AArch64::STURQi; 3252 case AArch64::LDRDui: 3253 return AArch64::LDURDi; 3254 case AArch64::STRDui: 3255 return AArch64::STURDi; 3256 case AArch64::LDRXui: 3257 return AArch64::LDURXi; 3258 case AArch64::STRXui: 3259 return AArch64::STURXi; 3260 case AArch64::LDRWui: 3261 return AArch64::LDURWi; 3262 case AArch64::LDRSWui: 3263 return AArch64::LDURSWi; 3264 case AArch64::STRWui: 3265 return AArch64::STURWi; 3266 case AArch64::LDRHui: 3267 return AArch64::LDURHi; 3268 case AArch64::STRHui: 3269 return AArch64::STURHi; 3270 case AArch64::LDRHHui: 3271 return AArch64::LDURHHi; 3272 case AArch64::STRHHui: 3273 return AArch64::STURHHi; 3274 case AArch64::LDRSHXui: 3275 return AArch64::LDURSHXi; 3276 case AArch64::LDRSHWui: 3277 return AArch64::LDURSHWi; 3278 case AArch64::LDRBBui: 3279 return AArch64::LDURBBi; 3280 case AArch64::LDRBui: 3281 return AArch64::LDURBi; 3282 case AArch64::STRBBui: 3283 return AArch64::STURBBi; 3284 case AArch64::STRBui: 3285 return AArch64::STURBi; 3286 case AArch64::LDRSBWui: 3287 return AArch64::LDURSBWi; 3288 case AArch64::LDRSBXui: 3289 return AArch64::LDURSBXi; 3290 } 3291 } 3292 3293 // Given the opcode of a memory load/store instruction, return the opcode of an 3294 // instruction performing the same operation, but using 3295 // the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the 3296 // offset register. 
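// For example, LDRXroX, LDURXi and LDRXui all map to LDRXroW.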
3297 static unsigned offsetExtendOpcode(unsigned Opcode) { 3298 switch (Opcode) { 3299 default: 3300 llvm_unreachable("Address folding not implemented for instruction"); 3301 3302 case AArch64::LDRQroX: 3303 case AArch64::LDURQi: 3304 case AArch64::LDRQui: 3305 return AArch64::LDRQroW; 3306 case AArch64::STRQroX: 3307 case AArch64::STURQi: 3308 case AArch64::STRQui: 3309 return AArch64::STRQroW; 3310 case AArch64::LDRDroX: 3311 case AArch64::LDURDi: 3312 case AArch64::LDRDui: 3313 return AArch64::LDRDroW; 3314 case AArch64::STRDroX: 3315 case AArch64::STURDi: 3316 case AArch64::STRDui: 3317 return AArch64::STRDroW; 3318 case AArch64::LDRXroX: 3319 case AArch64::LDURXi: 3320 case AArch64::LDRXui: 3321 return AArch64::LDRXroW; 3322 case AArch64::STRXroX: 3323 case AArch64::STURXi: 3324 case AArch64::STRXui: 3325 return AArch64::STRXroW; 3326 case AArch64::LDRWroX: 3327 case AArch64::LDURWi: 3328 case AArch64::LDRWui: 3329 return AArch64::LDRWroW; 3330 case AArch64::LDRSWroX: 3331 case AArch64::LDURSWi: 3332 case AArch64::LDRSWui: 3333 return AArch64::LDRSWroW; 3334 case AArch64::STRWroX: 3335 case AArch64::STURWi: 3336 case AArch64::STRWui: 3337 return AArch64::STRWroW; 3338 case AArch64::LDRHroX: 3339 case AArch64::LDURHi: 3340 case AArch64::LDRHui: 3341 return AArch64::LDRHroW; 3342 case AArch64::STRHroX: 3343 case AArch64::STURHi: 3344 case AArch64::STRHui: 3345 return AArch64::STRHroW; 3346 case AArch64::LDRHHroX: 3347 case AArch64::LDURHHi: 3348 case AArch64::LDRHHui: 3349 return AArch64::LDRHHroW; 3350 case AArch64::STRHHroX: 3351 case AArch64::STURHHi: 3352 case AArch64::STRHHui: 3353 return AArch64::STRHHroW; 3354 case AArch64::LDRSHXroX: 3355 case AArch64::LDURSHXi: 3356 case AArch64::LDRSHXui: 3357 return AArch64::LDRSHXroW; 3358 case AArch64::LDRSHWroX: 3359 case AArch64::LDURSHWi: 3360 case AArch64::LDRSHWui: 3361 return AArch64::LDRSHWroW; 3362 case AArch64::LDRBroX: 3363 case AArch64::LDURBi: 3364 case AArch64::LDRBui: 3365 return AArch64::LDRBroW; 3366 case AArch64::LDRBBroX: 3367 case AArch64::LDURBBi: 3368 case AArch64::LDRBBui: 3369 return AArch64::LDRBBroW; 3370 case AArch64::LDRSBXroX: 3371 case AArch64::LDURSBXi: 3372 case AArch64::LDRSBXui: 3373 return AArch64::LDRSBXroW; 3374 case AArch64::LDRSBWroX: 3375 case AArch64::LDURSBWi: 3376 case AArch64::LDRSBWui: 3377 return AArch64::LDRSBWroW; 3378 case AArch64::STRBroX: 3379 case AArch64::STURBi: 3380 case AArch64::STRBui: 3381 return AArch64::STRBroW; 3382 case AArch64::STRBBroX: 3383 case AArch64::STURBBi: 3384 case AArch64::STRBBui: 3385 return AArch64::STRBBroW; 3386 } 3387 } 3388 3389 MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI, 3390 const ExtAddrMode &AM) const { 3391 3392 const DebugLoc &DL = MemI.getDebugLoc(); 3393 MachineBasicBlock &MBB = *MemI.getParent(); 3394 MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo(); 3395 3396 if (AM.Form == ExtAddrMode::Formula::Basic) { 3397 if (AM.ScaledReg) { 3398 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`. 3399 unsigned Opcode = regOffsetOpcode(MemI.getOpcode()); 3400 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass); 3401 auto B = BuildMI(MBB, MemI, DL, get(Opcode)) 3402 .addReg(MemI.getOperand(0).getReg(), 3403 MemI.mayLoad() ? 
RegState::Define : 0) 3404 .addReg(AM.BaseReg) 3405 .addReg(AM.ScaledReg) 3406 .addImm(0) 3407 .addImm(AM.Scale > 1) 3408 .setMemRefs(MemI.memoperands()) 3409 .setMIFlags(MemI.getFlags()); 3410 return B.getInstr(); 3411 } 3412 3413 assert(AM.ScaledReg == 0 && AM.Scale == 0 && 3414 "Addressing mode not supported for folding"); 3415 3416 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`. 3417 unsigned Scale = 1; 3418 unsigned Opcode = MemI.getOpcode(); 3419 if (isInt<9>(AM.Displacement)) 3420 Opcode = unscaledOffsetOpcode(Opcode); 3421 else 3422 Opcode = scaledOffsetOpcode(Opcode, Scale); 3423 3424 auto B = BuildMI(MBB, MemI, DL, get(Opcode)) 3425 .addReg(MemI.getOperand(0).getReg(), 3426 MemI.mayLoad() ? RegState::Define : 0) 3427 .addReg(AM.BaseReg) 3428 .addImm(AM.Displacement / Scale) 3429 .setMemRefs(MemI.memoperands()) 3430 .setMIFlags(MemI.getFlags()); 3431 return B.getInstr(); 3432 } 3433 3434 if (AM.Form == ExtAddrMode::Formula::SExtScaledReg || 3435 AM.Form == ExtAddrMode::Formula::ZExtScaledReg) { 3436 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`. 3437 assert(AM.ScaledReg && !AM.Displacement && 3438 "Address offset can be a register or an immediate, but not both"); 3439 unsigned Opcode = offsetExtendOpcode(MemI.getOpcode()); 3440 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass); 3441 // Make sure the offset register is in the correct register class. 3442 Register OffsetReg = AM.ScaledReg; 3443 const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg); 3444 if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) { 3445 OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 3446 BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg) 3447 .addReg(AM.ScaledReg, 0, AArch64::sub_32); 3448 } 3449 auto B = BuildMI(MBB, MemI, DL, get(Opcode)) 3450 .addReg(MemI.getOperand(0).getReg(), 3451 MemI.mayLoad() ? RegState::Define : 0) 3452 .addReg(AM.BaseReg) 3453 .addReg(OffsetReg) 3454 .addImm(AM.Form == ExtAddrMode::Formula::SExtScaledReg) 3455 .addImm(AM.Scale != 1) 3456 .setMemRefs(MemI.memoperands()) 3457 .setMIFlags(MemI.getFlags()); 3458 3459 return B.getInstr(); 3460 } 3461 3462 llvm_unreachable( 3463 "Function must not be called with an addressing mode it can't handle"); 3464 } 3465 3466 bool AArch64InstrInfo::getMemOperandWithOffsetWidth( 3467 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset, 3468 bool &OffsetIsScalable, TypeSize &Width, 3469 const TargetRegisterInfo *TRI) const { 3470 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 3471 // Handle only loads/stores with base register followed by immediate offset. 3472 if (LdSt.getNumExplicitOperands() == 3) { 3473 // Non-paired instruction (e.g., ldr x1, [x0, #8]). 3474 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) || 3475 !LdSt.getOperand(2).isImm()) 3476 return false; 3477 } else if (LdSt.getNumExplicitOperands() == 4) { 3478 // Paired instruction (e.g., ldp x1, x2, [x0, #8]). 3479 if (!LdSt.getOperand(1).isReg() || 3480 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) || 3481 !LdSt.getOperand(3).isImm()) 3482 return false; 3483 } else 3484 return false; 3485 3486 // Get the scaling factor for the instruction and set the width for the 3487 // instruction. 3488 TypeSize Scale(0U, false); 3489 int64_t Dummy1, Dummy2; 3490 3491 // If this returns false, then it's an instruction we don't want to handle. 
3492 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2)) 3493 return false; 3494 3495 // Compute the offset. Offset is calculated as the immediate operand 3496 // multiplied by the scaling factor. Unscaled instructions have scaling factor 3497 // set to 1. 3498 if (LdSt.getNumExplicitOperands() == 3) { 3499 BaseOp = &LdSt.getOperand(1); 3500 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue(); 3501 } else { 3502 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); 3503 BaseOp = &LdSt.getOperand(2); 3504 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue(); 3505 } 3506 OffsetIsScalable = Scale.isScalable(); 3507 3508 if (!BaseOp->isReg() && !BaseOp->isFI()) 3509 return false; 3510 3511 return true; 3512 } 3513 3514 MachineOperand & 3515 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const { 3516 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 3517 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1); 3518 assert(OfsOp.isImm() && "Offset operand wasn't immediate."); 3519 return OfsOp; 3520 } 3521 3522 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, 3523 TypeSize &Width, int64_t &MinOffset, 3524 int64_t &MaxOffset) { 3525 switch (Opcode) { 3526 // Not a memory operation or something we want to handle. 3527 default: 3528 Scale = TypeSize::getFixed(0); 3529 Width = TypeSize::getFixed(0); 3530 MinOffset = MaxOffset = 0; 3531 return false; 3532 case AArch64::STRWpost: 3533 case AArch64::LDRWpost: 3534 Width = TypeSize::getFixed(32); 3535 Scale = TypeSize::getFixed(4); 3536 MinOffset = -256; 3537 MaxOffset = 255; 3538 break; 3539 case AArch64::LDURQi: 3540 case AArch64::STURQi: 3541 Width = TypeSize::getFixed(16); 3542 Scale = TypeSize::getFixed(1); 3543 MinOffset = -256; 3544 MaxOffset = 255; 3545 break; 3546 case AArch64::PRFUMi: 3547 case AArch64::LDURXi: 3548 case AArch64::LDURDi: 3549 case AArch64::LDAPURXi: 3550 case AArch64::STURXi: 3551 case AArch64::STURDi: 3552 case AArch64::STLURXi: 3553 Width = TypeSize::getFixed(8); 3554 Scale = TypeSize::getFixed(1); 3555 MinOffset = -256; 3556 MaxOffset = 255; 3557 break; 3558 case AArch64::LDURWi: 3559 case AArch64::LDURSi: 3560 case AArch64::LDURSWi: 3561 case AArch64::LDAPURi: 3562 case AArch64::LDAPURSWi: 3563 case AArch64::STURWi: 3564 case AArch64::STURSi: 3565 case AArch64::STLURWi: 3566 Width = TypeSize::getFixed(4); 3567 Scale = TypeSize::getFixed(1); 3568 MinOffset = -256; 3569 MaxOffset = 255; 3570 break; 3571 case AArch64::LDURHi: 3572 case AArch64::LDURHHi: 3573 case AArch64::LDURSHXi: 3574 case AArch64::LDURSHWi: 3575 case AArch64::LDAPURHi: 3576 case AArch64::LDAPURSHWi: 3577 case AArch64::LDAPURSHXi: 3578 case AArch64::STURHi: 3579 case AArch64::STURHHi: 3580 case AArch64::STLURHi: 3581 Width = TypeSize::getFixed(2); 3582 Scale = TypeSize::getFixed(1); 3583 MinOffset = -256; 3584 MaxOffset = 255; 3585 break; 3586 case AArch64::LDURBi: 3587 case AArch64::LDURBBi: 3588 case AArch64::LDURSBXi: 3589 case AArch64::LDURSBWi: 3590 case AArch64::LDAPURBi: 3591 case AArch64::LDAPURSBWi: 3592 case AArch64::LDAPURSBXi: 3593 case AArch64::STURBi: 3594 case AArch64::STURBBi: 3595 case AArch64::STLURBi: 3596 Width = TypeSize::getFixed(1); 3597 Scale = TypeSize::getFixed(1); 3598 MinOffset = -256; 3599 MaxOffset = 255; 3600 break; 3601 case AArch64::LDPQi: 3602 case AArch64::LDNPQi: 3603 case AArch64::STPQi: 3604 case AArch64::STNPQi: 3605 Scale = TypeSize::getFixed(16); 3606 Width = 
TypeSize::getFixed(32); 3607 MinOffset = -64; 3608 MaxOffset = 63; 3609 break; 3610 case AArch64::LDRQui: 3611 case AArch64::STRQui: 3612 Scale = TypeSize::getFixed(16); 3613 Width = TypeSize::getFixed(16); 3614 MinOffset = 0; 3615 MaxOffset = 4095; 3616 break; 3617 case AArch64::LDPXi: 3618 case AArch64::LDPDi: 3619 case AArch64::LDNPXi: 3620 case AArch64::LDNPDi: 3621 case AArch64::STPXi: 3622 case AArch64::STPDi: 3623 case AArch64::STNPXi: 3624 case AArch64::STNPDi: 3625 Scale = TypeSize::getFixed(8); 3626 Width = TypeSize::getFixed(16); 3627 MinOffset = -64; 3628 MaxOffset = 63; 3629 break; 3630 case AArch64::PRFMui: 3631 case AArch64::LDRXui: 3632 case AArch64::LDRDui: 3633 case AArch64::STRXui: 3634 case AArch64::STRDui: 3635 Scale = TypeSize::getFixed(8); 3636 Width = TypeSize::getFixed(8); 3637 MinOffset = 0; 3638 MaxOffset = 4095; 3639 break; 3640 case AArch64::StoreSwiftAsyncContext: 3641 // Store is an STRXui, but there might be an ADDXri in the expansion too. 3642 Scale = TypeSize::getFixed(1); 3643 Width = TypeSize::getFixed(8); 3644 MinOffset = 0; 3645 MaxOffset = 4095; 3646 break; 3647 case AArch64::LDPWi: 3648 case AArch64::LDPSi: 3649 case AArch64::LDNPWi: 3650 case AArch64::LDNPSi: 3651 case AArch64::STPWi: 3652 case AArch64::STPSi: 3653 case AArch64::STNPWi: 3654 case AArch64::STNPSi: 3655 Scale = TypeSize::getFixed(4); 3656 Width = TypeSize::getFixed(8); 3657 MinOffset = -64; 3658 MaxOffset = 63; 3659 break; 3660 case AArch64::LDRWui: 3661 case AArch64::LDRSui: 3662 case AArch64::LDRSWui: 3663 case AArch64::STRWui: 3664 case AArch64::STRSui: 3665 Scale = TypeSize::getFixed(4); 3666 Width = TypeSize::getFixed(4); 3667 MinOffset = 0; 3668 MaxOffset = 4095; 3669 break; 3670 case AArch64::LDRHui: 3671 case AArch64::LDRHHui: 3672 case AArch64::LDRSHWui: 3673 case AArch64::LDRSHXui: 3674 case AArch64::STRHui: 3675 case AArch64::STRHHui: 3676 Scale = TypeSize::getFixed(2); 3677 Width = TypeSize::getFixed(2); 3678 MinOffset = 0; 3679 MaxOffset = 4095; 3680 break; 3681 case AArch64::LDRBui: 3682 case AArch64::LDRBBui: 3683 case AArch64::LDRSBWui: 3684 case AArch64::LDRSBXui: 3685 case AArch64::STRBui: 3686 case AArch64::STRBBui: 3687 Scale = TypeSize::getFixed(1); 3688 Width = TypeSize::getFixed(1); 3689 MinOffset = 0; 3690 MaxOffset = 4095; 3691 break; 3692 case AArch64::STPXpre: 3693 case AArch64::LDPXpost: 3694 case AArch64::STPDpre: 3695 case AArch64::LDPDpost: 3696 Scale = TypeSize::getFixed(8); 3697 Width = TypeSize::getFixed(8); 3698 MinOffset = -512; 3699 MaxOffset = 504; 3700 break; 3701 case AArch64::STPQpre: 3702 case AArch64::LDPQpost: 3703 Scale = TypeSize::getFixed(16); 3704 Width = TypeSize::getFixed(16); 3705 MinOffset = -1024; 3706 MaxOffset = 1008; 3707 break; 3708 case AArch64::STRXpre: 3709 case AArch64::STRDpre: 3710 case AArch64::LDRXpost: 3711 case AArch64::LDRDpost: 3712 Scale = TypeSize::getFixed(1); 3713 Width = TypeSize::getFixed(8); 3714 MinOffset = -256; 3715 MaxOffset = 255; 3716 break; 3717 case AArch64::STRQpre: 3718 case AArch64::LDRQpost: 3719 Scale = TypeSize::getFixed(1); 3720 Width = TypeSize::getFixed(16); 3721 MinOffset = -256; 3722 MaxOffset = 255; 3723 break; 3724 case AArch64::ADDG: 3725 Scale = TypeSize::getFixed(16); 3726 Width = TypeSize::getFixed(0); 3727 MinOffset = 0; 3728 MaxOffset = 63; 3729 break; 3730 case AArch64::TAGPstack: 3731 Scale = TypeSize::getFixed(16); 3732 Width = TypeSize::getFixed(0); 3733 // TAGP with a negative offset turns into SUBP, which has a maximum offset 3734 // of 63 (not 64!). 
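// Illustrative arithmetic for these MTE cases: offsets are counted in 16-byte
// tag granules, so the [-63, 63] encoded range set just below corresponds to
// byte offsets from -1008 to +1008.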
3735 MinOffset = -63; 3736 MaxOffset = 63; 3737 break; 3738 case AArch64::LDG: 3739 case AArch64::STGi: 3740 case AArch64::STZGi: 3741 Scale = TypeSize::getFixed(16); 3742 Width = TypeSize::getFixed(16); 3743 MinOffset = -256; 3744 MaxOffset = 255; 3745 break; 3746 case AArch64::STR_ZZZZXI: 3747 case AArch64::LDR_ZZZZXI: 3748 Scale = TypeSize::getScalable(16); 3749 Width = TypeSize::getScalable(16 * 4); 3750 MinOffset = -256; 3751 MaxOffset = 252; 3752 break; 3753 case AArch64::STR_ZZZXI: 3754 case AArch64::LDR_ZZZXI: 3755 Scale = TypeSize::getScalable(16); 3756 Width = TypeSize::getScalable(16 * 3); 3757 MinOffset = -256; 3758 MaxOffset = 253; 3759 break; 3760 case AArch64::STR_ZZXI: 3761 case AArch64::LDR_ZZXI: 3762 Scale = TypeSize::getScalable(16); 3763 Width = TypeSize::getScalable(16 * 2); 3764 MinOffset = -256; 3765 MaxOffset = 254; 3766 break; 3767 case AArch64::LDR_PXI: 3768 case AArch64::STR_PXI: 3769 Scale = TypeSize::getScalable(2); 3770 Width = TypeSize::getScalable(2); 3771 MinOffset = -256; 3772 MaxOffset = 255; 3773 break; 3774 case AArch64::LDR_PPXI: 3775 case AArch64::STR_PPXI: 3776 Scale = TypeSize::getScalable(2); 3777 Width = TypeSize::getScalable(2 * 2); 3778 MinOffset = -256; 3779 MaxOffset = 254; 3780 break; 3781 case AArch64::LDR_ZXI: 3782 case AArch64::STR_ZXI: 3783 Scale = TypeSize::getScalable(16); 3784 Width = TypeSize::getScalable(16); 3785 MinOffset = -256; 3786 MaxOffset = 255; 3787 break; 3788 case AArch64::LD1B_IMM: 3789 case AArch64::LD1H_IMM: 3790 case AArch64::LD1W_IMM: 3791 case AArch64::LD1D_IMM: 3792 case AArch64::LDNT1B_ZRI: 3793 case AArch64::LDNT1H_ZRI: 3794 case AArch64::LDNT1W_ZRI: 3795 case AArch64::LDNT1D_ZRI: 3796 case AArch64::ST1B_IMM: 3797 case AArch64::ST1H_IMM: 3798 case AArch64::ST1W_IMM: 3799 case AArch64::ST1D_IMM: 3800 case AArch64::STNT1B_ZRI: 3801 case AArch64::STNT1H_ZRI: 3802 case AArch64::STNT1W_ZRI: 3803 case AArch64::STNT1D_ZRI: 3804 case AArch64::LDNF1B_IMM: 3805 case AArch64::LDNF1H_IMM: 3806 case AArch64::LDNF1W_IMM: 3807 case AArch64::LDNF1D_IMM: 3808 // A full vectors worth of data 3809 // Width = mbytes * elements 3810 Scale = TypeSize::getScalable(16); 3811 Width = TypeSize::getScalable(16); 3812 MinOffset = -8; 3813 MaxOffset = 7; 3814 break; 3815 case AArch64::LD2B_IMM: 3816 case AArch64::LD2H_IMM: 3817 case AArch64::LD2W_IMM: 3818 case AArch64::LD2D_IMM: 3819 case AArch64::ST2B_IMM: 3820 case AArch64::ST2H_IMM: 3821 case AArch64::ST2W_IMM: 3822 case AArch64::ST2D_IMM: 3823 Scale = TypeSize::getScalable(32); 3824 Width = TypeSize::getScalable(16 * 2); 3825 MinOffset = -8; 3826 MaxOffset = 7; 3827 break; 3828 case AArch64::LD3B_IMM: 3829 case AArch64::LD3H_IMM: 3830 case AArch64::LD3W_IMM: 3831 case AArch64::LD3D_IMM: 3832 case AArch64::ST3B_IMM: 3833 case AArch64::ST3H_IMM: 3834 case AArch64::ST3W_IMM: 3835 case AArch64::ST3D_IMM: 3836 Scale = TypeSize::getScalable(48); 3837 Width = TypeSize::getScalable(16 * 3); 3838 MinOffset = -8; 3839 MaxOffset = 7; 3840 break; 3841 case AArch64::LD4B_IMM: 3842 case AArch64::LD4H_IMM: 3843 case AArch64::LD4W_IMM: 3844 case AArch64::LD4D_IMM: 3845 case AArch64::ST4B_IMM: 3846 case AArch64::ST4H_IMM: 3847 case AArch64::ST4W_IMM: 3848 case AArch64::ST4D_IMM: 3849 Scale = TypeSize::getScalable(64); 3850 Width = TypeSize::getScalable(16 * 4); 3851 MinOffset = -8; 3852 MaxOffset = 7; 3853 break; 3854 case AArch64::LD1B_H_IMM: 3855 case AArch64::LD1SB_H_IMM: 3856 case AArch64::LD1H_S_IMM: 3857 case AArch64::LD1SH_S_IMM: 3858 case AArch64::LD1W_D_IMM: 3859 case AArch64::LD1SW_D_IMM: 3860 
case AArch64::ST1B_H_IMM: 3861 case AArch64::ST1H_S_IMM: 3862 case AArch64::ST1W_D_IMM: 3863 case AArch64::LDNF1B_H_IMM: 3864 case AArch64::LDNF1SB_H_IMM: 3865 case AArch64::LDNF1H_S_IMM: 3866 case AArch64::LDNF1SH_S_IMM: 3867 case AArch64::LDNF1W_D_IMM: 3868 case AArch64::LDNF1SW_D_IMM: 3869 // A half vector worth of data 3870 // Width = mbytes * elements 3871 Scale = TypeSize::getScalable(8); 3872 Width = TypeSize::getScalable(8); 3873 MinOffset = -8; 3874 MaxOffset = 7; 3875 break; 3876 case AArch64::LD1B_S_IMM: 3877 case AArch64::LD1SB_S_IMM: 3878 case AArch64::LD1H_D_IMM: 3879 case AArch64::LD1SH_D_IMM: 3880 case AArch64::ST1B_S_IMM: 3881 case AArch64::ST1H_D_IMM: 3882 case AArch64::LDNF1B_S_IMM: 3883 case AArch64::LDNF1SB_S_IMM: 3884 case AArch64::LDNF1H_D_IMM: 3885 case AArch64::LDNF1SH_D_IMM: 3886 // A quarter vector worth of data 3887 // Width = mbytes * elements 3888 Scale = TypeSize::getScalable(4); 3889 Width = TypeSize::getScalable(4); 3890 MinOffset = -8; 3891 MaxOffset = 7; 3892 break; 3893 case AArch64::LD1B_D_IMM: 3894 case AArch64::LD1SB_D_IMM: 3895 case AArch64::ST1B_D_IMM: 3896 case AArch64::LDNF1B_D_IMM: 3897 case AArch64::LDNF1SB_D_IMM: 3898 // A eighth vector worth of data 3899 // Width = mbytes * elements 3900 Scale = TypeSize::getScalable(2); 3901 Width = TypeSize::getScalable(2); 3902 MinOffset = -8; 3903 MaxOffset = 7; 3904 break; 3905 case AArch64::ST2Gi: 3906 case AArch64::STZ2Gi: 3907 Scale = TypeSize::getFixed(16); 3908 Width = TypeSize::getFixed(32); 3909 MinOffset = -256; 3910 MaxOffset = 255; 3911 break; 3912 case AArch64::STGPi: 3913 Scale = TypeSize::getFixed(16); 3914 Width = TypeSize::getFixed(16); 3915 MinOffset = -64; 3916 MaxOffset = 63; 3917 break; 3918 case AArch64::LD1RB_IMM: 3919 case AArch64::LD1RB_H_IMM: 3920 case AArch64::LD1RB_S_IMM: 3921 case AArch64::LD1RB_D_IMM: 3922 case AArch64::LD1RSB_H_IMM: 3923 case AArch64::LD1RSB_S_IMM: 3924 case AArch64::LD1RSB_D_IMM: 3925 Scale = TypeSize::getFixed(1); 3926 Width = TypeSize::getFixed(1); 3927 MinOffset = 0; 3928 MaxOffset = 63; 3929 break; 3930 case AArch64::LD1RH_IMM: 3931 case AArch64::LD1RH_S_IMM: 3932 case AArch64::LD1RH_D_IMM: 3933 case AArch64::LD1RSH_S_IMM: 3934 case AArch64::LD1RSH_D_IMM: 3935 Scale = TypeSize::getFixed(2); 3936 Width = TypeSize::getFixed(2); 3937 MinOffset = 0; 3938 MaxOffset = 63; 3939 break; 3940 case AArch64::LD1RW_IMM: 3941 case AArch64::LD1RW_D_IMM: 3942 case AArch64::LD1RSW_IMM: 3943 Scale = TypeSize::getFixed(4); 3944 Width = TypeSize::getFixed(4); 3945 MinOffset = 0; 3946 MaxOffset = 63; 3947 break; 3948 case AArch64::LD1RD_IMM: 3949 Scale = TypeSize::getFixed(8); 3950 Width = TypeSize::getFixed(8); 3951 MinOffset = 0; 3952 MaxOffset = 63; 3953 break; 3954 } 3955 3956 return true; 3957 } 3958 3959 // Scaling factor for unscaled load or store. 
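// For reference (illustrative examples derived from the switch below):
// getMemScale returns the access size in bytes, e.g. 8 for both LDRXui and
// LDURXi and 16 for LDRQui or STPQi; it is the value that scaleOffset()
// divides unscaled byte offsets by.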
3960 int AArch64InstrInfo::getMemScale(unsigned Opc) { 3961 switch (Opc) { 3962 default: 3963 llvm_unreachable("Opcode has unknown scale!"); 3964 case AArch64::LDRBBui: 3965 case AArch64::LDURBBi: 3966 case AArch64::LDRSBWui: 3967 case AArch64::LDURSBWi: 3968 case AArch64::STRBBui: 3969 case AArch64::STURBBi: 3970 return 1; 3971 case AArch64::LDRHHui: 3972 case AArch64::LDURHHi: 3973 case AArch64::LDRSHWui: 3974 case AArch64::LDURSHWi: 3975 case AArch64::STRHHui: 3976 case AArch64::STURHHi: 3977 return 2; 3978 case AArch64::LDRSui: 3979 case AArch64::LDURSi: 3980 case AArch64::LDRSpre: 3981 case AArch64::LDRSWui: 3982 case AArch64::LDURSWi: 3983 case AArch64::LDRSWpre: 3984 case AArch64::LDRWpre: 3985 case AArch64::LDRWui: 3986 case AArch64::LDURWi: 3987 case AArch64::STRSui: 3988 case AArch64::STURSi: 3989 case AArch64::STRSpre: 3990 case AArch64::STRWui: 3991 case AArch64::STURWi: 3992 case AArch64::STRWpre: 3993 case AArch64::LDPSi: 3994 case AArch64::LDPSWi: 3995 case AArch64::LDPWi: 3996 case AArch64::STPSi: 3997 case AArch64::STPWi: 3998 return 4; 3999 case AArch64::LDRDui: 4000 case AArch64::LDURDi: 4001 case AArch64::LDRDpre: 4002 case AArch64::LDRXui: 4003 case AArch64::LDURXi: 4004 case AArch64::LDRXpre: 4005 case AArch64::STRDui: 4006 case AArch64::STURDi: 4007 case AArch64::STRDpre: 4008 case AArch64::STRXui: 4009 case AArch64::STURXi: 4010 case AArch64::STRXpre: 4011 case AArch64::LDPDi: 4012 case AArch64::LDPXi: 4013 case AArch64::STPDi: 4014 case AArch64::STPXi: 4015 return 8; 4016 case AArch64::LDRQui: 4017 case AArch64::LDURQi: 4018 case AArch64::STRQui: 4019 case AArch64::STURQi: 4020 case AArch64::STRQpre: 4021 case AArch64::LDPQi: 4022 case AArch64::LDRQpre: 4023 case AArch64::STPQi: 4024 case AArch64::STGi: 4025 case AArch64::STZGi: 4026 case AArch64::ST2Gi: 4027 case AArch64::STZ2Gi: 4028 case AArch64::STGPi: 4029 return 16; 4030 } 4031 } 4032 4033 bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) { 4034 switch (MI.getOpcode()) { 4035 default: 4036 return false; 4037 case AArch64::LDRWpre: 4038 case AArch64::LDRXpre: 4039 case AArch64::LDRSWpre: 4040 case AArch64::LDRSpre: 4041 case AArch64::LDRDpre: 4042 case AArch64::LDRQpre: 4043 return true; 4044 } 4045 } 4046 4047 bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) { 4048 switch (MI.getOpcode()) { 4049 default: 4050 return false; 4051 case AArch64::STRWpre: 4052 case AArch64::STRXpre: 4053 case AArch64::STRSpre: 4054 case AArch64::STRDpre: 4055 case AArch64::STRQpre: 4056 return true; 4057 } 4058 } 4059 4060 bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) { 4061 return isPreLd(MI) || isPreSt(MI); 4062 } 4063 4064 bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) { 4065 switch (MI.getOpcode()) { 4066 default: 4067 return false; 4068 case AArch64::LDPSi: 4069 case AArch64::LDPSWi: 4070 case AArch64::LDPDi: 4071 case AArch64::LDPQi: 4072 case AArch64::LDPWi: 4073 case AArch64::LDPXi: 4074 case AArch64::STPSi: 4075 case AArch64::STPDi: 4076 case AArch64::STPQi: 4077 case AArch64::STPWi: 4078 case AArch64::STPXi: 4079 case AArch64::STGPi: 4080 return true; 4081 } 4082 } 4083 4084 const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) { 4085 unsigned Idx = 4086 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 
2 4087 : 1; 4088 return MI.getOperand(Idx); 4089 } 4090 4091 const MachineOperand & 4092 AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) { 4093 unsigned Idx = 4094 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3 4095 : 2; 4096 return MI.getOperand(Idx); 4097 } 4098 4099 const MachineOperand & 4100 AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) { 4101 switch (MI.getOpcode()) { 4102 default: 4103 llvm_unreachable("Unexpected opcode"); 4104 case AArch64::LDRBBroX: 4105 return MI.getOperand(4); 4106 } 4107 } 4108 4109 static const TargetRegisterClass *getRegClass(const MachineInstr &MI, 4110 Register Reg) { 4111 if (MI.getParent() == nullptr) 4112 return nullptr; 4113 const MachineFunction *MF = MI.getParent()->getParent(); 4114 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr; 4115 } 4116 4117 bool AArch64InstrInfo::isHForm(const MachineInstr &MI) { 4118 auto IsHFPR = [&](const MachineOperand &Op) { 4119 if (!Op.isReg()) 4120 return false; 4121 auto Reg = Op.getReg(); 4122 if (Reg.isPhysical()) 4123 return AArch64::FPR16RegClass.contains(Reg); 4124 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); 4125 return TRC == &AArch64::FPR16RegClass || 4126 TRC == &AArch64::FPR16_loRegClass; 4127 }; 4128 return llvm::any_of(MI.operands(), IsHFPR); 4129 } 4130 4131 bool AArch64InstrInfo::isQForm(const MachineInstr &MI) { 4132 auto IsQFPR = [&](const MachineOperand &Op) { 4133 if (!Op.isReg()) 4134 return false; 4135 auto Reg = Op.getReg(); 4136 if (Reg.isPhysical()) 4137 return AArch64::FPR128RegClass.contains(Reg); 4138 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); 4139 return TRC == &AArch64::FPR128RegClass || 4140 TRC == &AArch64::FPR128_loRegClass; 4141 }; 4142 return llvm::any_of(MI.operands(), IsQFPR); 4143 } 4144 4145 bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) { 4146 switch (MI.getOpcode()) { 4147 case AArch64::BRK: 4148 case AArch64::HLT: 4149 case AArch64::PACIASP: 4150 case AArch64::PACIBSP: 4151 // Implicit BTI behavior. 4152 return true; 4153 case AArch64::PAUTH_PROLOGUE: 4154 // PAUTH_PROLOGUE expands to PACI(A|B)SP. 4155 return true; 4156 case AArch64::HINT: { 4157 unsigned Imm = MI.getOperand(0).getImm(); 4158 // Explicit BTI instruction. 4159 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38) 4160 return true; 4161 // PACI(A|B)SP instructions. 4162 if (Imm == 25 || Imm == 27) 4163 return true; 4164 return false; 4165 } 4166 default: 4167 return false; 4168 } 4169 } 4170 4171 bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) { 4172 auto IsFPR = [&](const MachineOperand &Op) { 4173 if (!Op.isReg()) 4174 return false; 4175 auto Reg = Op.getReg(); 4176 if (Reg.isPhysical()) 4177 return AArch64::FPR128RegClass.contains(Reg) || 4178 AArch64::FPR64RegClass.contains(Reg) || 4179 AArch64::FPR32RegClass.contains(Reg) || 4180 AArch64::FPR16RegClass.contains(Reg) || 4181 AArch64::FPR8RegClass.contains(Reg); 4182 4183 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); 4184 return TRC == &AArch64::FPR128RegClass || 4185 TRC == &AArch64::FPR128_loRegClass || 4186 TRC == &AArch64::FPR64RegClass || 4187 TRC == &AArch64::FPR64_loRegClass || 4188 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass || 4189 TRC == &AArch64::FPR8RegClass; 4190 }; 4191 return llvm::any_of(MI.operands(), IsFPR); 4192 } 4193 4194 // Scale the unscaled offsets. Returns false if the unscaled offset can't be 4195 // scaled. 
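// Worked example (illustrative): STURXi has a scale of 8, so a byte offset of
// 24 becomes an element offset of 3, while a byte offset of 20 is not a
// multiple of 8 and the function returns false.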
4196 static bool scaleOffset(unsigned Opc, int64_t &Offset) { 4197 int Scale = AArch64InstrInfo::getMemScale(Opc); 4198 4199 // If the byte-offset isn't a multiple of the stride, we can't scale this 4200 // offset. 4201 if (Offset % Scale != 0) 4202 return false; 4203 4204 // Convert the byte-offset used by unscaled into an "element" offset used 4205 // by the scaled pair load/store instructions. 4206 Offset /= Scale; 4207 return true; 4208 } 4209 4210 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) { 4211 if (FirstOpc == SecondOpc) 4212 return true; 4213 // We can also pair sign-ext and zero-ext instructions. 4214 switch (FirstOpc) { 4215 default: 4216 return false; 4217 case AArch64::LDRWui: 4218 case AArch64::LDURWi: 4219 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi; 4220 case AArch64::LDRSWui: 4221 case AArch64::LDURSWi: 4222 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi; 4223 } 4224 // These instructions can't be paired based on their opcodes. 4225 return false; 4226 } 4227 4228 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, 4229 int64_t Offset1, unsigned Opcode1, int FI2, 4230 int64_t Offset2, unsigned Opcode2) { 4231 // Accesses through fixed stack object frame indices may access a different 4232 // fixed stack slot. Check that the object offsets + offsets match. 4233 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) { 4234 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1); 4235 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2); 4236 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered."); 4237 // Convert to scaled object offsets. 4238 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1); 4239 if (ObjectOffset1 % Scale1 != 0) 4240 return false; 4241 ObjectOffset1 /= Scale1; 4242 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2); 4243 if (ObjectOffset2 % Scale2 != 0) 4244 return false; 4245 ObjectOffset2 /= Scale2; 4246 ObjectOffset1 += Offset1; 4247 ObjectOffset2 += Offset2; 4248 return ObjectOffset1 + 1 == ObjectOffset2; 4249 } 4250 4251 return FI1 == FI2; 4252 } 4253 4254 /// Detect opportunities for ldp/stp formation. 4255 /// 4256 /// Only called for LdSt for which getMemOperandWithOffset returns true. 4257 bool AArch64InstrInfo::shouldClusterMemOps( 4258 ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1, 4259 bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2, 4260 int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize, 4261 unsigned NumBytes) const { 4262 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1); 4263 const MachineOperand &BaseOp1 = *BaseOps1.front(); 4264 const MachineOperand &BaseOp2 = *BaseOps2.front(); 4265 const MachineInstr &FirstLdSt = *BaseOp1.getParent(); 4266 const MachineInstr &SecondLdSt = *BaseOp2.getParent(); 4267 if (BaseOp1.getType() != BaseOp2.getType()) 4268 return false; 4269 4270 assert((BaseOp1.isReg() || BaseOp1.isFI()) && 4271 "Only base registers and frame indices are supported."); 4272 4273 // Check for both base regs and base FI. 4274 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg()) 4275 return false; 4276 4277 // Only cluster up to a single pair. 4278 if (ClusterSize > 2) 4279 return false; 4280 4281 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt)) 4282 return false; 4283 4284 // Can we pair these instructions based on their opcodes? 
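// For example (illustrative): two LDRWui loads can pair, and an LDRWui can
// also pair with an LDRSWui (zero- vs. sign-extending 32-bit loads), but an
// LDRWui/LDRXui combination is rejected by canPairLdStOpc.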
4285 unsigned FirstOpc = FirstLdSt.getOpcode(); 4286 unsigned SecondOpc = SecondLdSt.getOpcode(); 4287 if (!canPairLdStOpc(FirstOpc, SecondOpc)) 4288 return false; 4289 4290 // Can't merge volatiles or load/stores that have a hint to avoid pair 4291 // formation, for example. 4292 if (!isCandidateToMergeOrPair(FirstLdSt) || 4293 !isCandidateToMergeOrPair(SecondLdSt)) 4294 return false; 4295 4296 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate. 4297 int64_t Offset1 = FirstLdSt.getOperand(2).getImm(); 4298 if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1)) 4299 return false; 4300 4301 int64_t Offset2 = SecondLdSt.getOperand(2).getImm(); 4302 if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2)) 4303 return false; 4304 4305 // Pairwise instructions have a 7-bit signed offset field. 4306 if (Offset1 > 63 || Offset1 < -64) 4307 return false; 4308 4309 // The caller should already have ordered First/SecondLdSt by offset. 4310 // Note: except for non-equal frame index bases 4311 if (BaseOp1.isFI()) { 4312 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) && 4313 "Caller should have ordered offsets."); 4314 4315 const MachineFrameInfo &MFI = 4316 FirstLdSt.getParent()->getParent()->getFrameInfo(); 4317 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc, 4318 BaseOp2.getIndex(), Offset2, SecondOpc); 4319 } 4320 4321 assert(Offset1 <= Offset2 && "Caller should have ordered offsets."); 4322 4323 return Offset1 + 1 == Offset2; 4324 } 4325 4326 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, 4327 unsigned Reg, unsigned SubIdx, 4328 unsigned State, 4329 const TargetRegisterInfo *TRI) { 4330 if (!SubIdx) 4331 return MIB.addReg(Reg, State); 4332 4333 if (Register::isPhysicalRegister(Reg)) 4334 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); 4335 return MIB.addReg(Reg, State, SubIdx); 4336 } 4337 4338 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, 4339 unsigned NumRegs) { 4340 // We really want the positive remainder mod 32 here, that happens to be 4341 // easily obtainable with a mask. 
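// Worked example (illustrative, register names chosen arbitrarily): for a
// three-register tuple copy where the destination starts one encoding above
// the source, e.g. D2_D3_D4 <- D1_D2_D3, we get (2 - 1) & 0x1f == 1 < 3, so a
// forward copy would overwrite D2 and D3 before they are read;
// copyPhysRegTuple therefore copies the sub-registers in reverse order.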
4342 return ((DestReg - SrcReg) & 0x1f) < NumRegs; 4343 } 4344 4345 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB, 4346 MachineBasicBlock::iterator I, 4347 const DebugLoc &DL, MCRegister DestReg, 4348 MCRegister SrcReg, bool KillSrc, 4349 unsigned Opcode, 4350 ArrayRef<unsigned> Indices) const { 4351 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON"); 4352 const TargetRegisterInfo *TRI = &getRegisterInfo(); 4353 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 4354 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 4355 unsigned NumRegs = Indices.size(); 4356 4357 int SubReg = 0, End = NumRegs, Incr = 1; 4358 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) { 4359 SubReg = NumRegs - 1; 4360 End = -1; 4361 Incr = -1; 4362 } 4363 4364 for (; SubReg != End; SubReg += Incr) { 4365 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 4366 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 4367 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); 4368 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 4369 } 4370 } 4371 4372 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB, 4373 MachineBasicBlock::iterator I, 4374 DebugLoc DL, unsigned DestReg, 4375 unsigned SrcReg, bool KillSrc, 4376 unsigned Opcode, unsigned ZeroReg, 4377 llvm::ArrayRef<unsigned> Indices) const { 4378 const TargetRegisterInfo *TRI = &getRegisterInfo(); 4379 unsigned NumRegs = Indices.size(); 4380 4381 #ifndef NDEBUG 4382 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 4383 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 4384 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 && 4385 "GPR reg sequences should not be able to overlap"); 4386 #endif 4387 4388 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) { 4389 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 4390 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 4391 MIB.addReg(ZeroReg); 4392 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 4393 MIB.addImm(0); 4394 } 4395 } 4396 4397 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, 4398 MachineBasicBlock::iterator I, 4399 const DebugLoc &DL, MCRegister DestReg, 4400 MCRegister SrcReg, bool KillSrc) const { 4401 if (AArch64::GPR32spRegClass.contains(DestReg) && 4402 (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { 4403 const TargetRegisterInfo *TRI = &getRegisterInfo(); 4404 4405 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) { 4406 // If either operand is WSP, expand to ADD #0. 4407 if (Subtarget.hasZeroCycleRegMove()) { 4408 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. 4409 MCRegister DestRegX = TRI->getMatchingSuperReg( 4410 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 4411 MCRegister SrcRegX = TRI->getMatchingSuperReg( 4412 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 4413 // This instruction is reading and writing X registers. This may upset 4414 // the register scavenger and machine verifier, so we need to indicate 4415 // that we are reading an undefined value from SrcRegX, but a proper 4416 // value from SrcReg. 
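// The emitted instruction is roughly (illustrative) "ADDXri Xd, Xn(undef),
// #0, lsl #0", plus an extra implicit use of the original 32-bit source
// register so that liveness information stays accurate.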
4417 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX) 4418 .addReg(SrcRegX, RegState::Undef) 4419 .addImm(0) 4420 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) 4421 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 4422 } else { 4423 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg) 4424 .addReg(SrcReg, getKillRegState(KillSrc)) 4425 .addImm(0) 4426 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 4427 } 4428 } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) { 4429 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg) 4430 .addImm(0) 4431 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 4432 } else { 4433 if (Subtarget.hasZeroCycleRegMove()) { 4434 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. 4435 MCRegister DestRegX = TRI->getMatchingSuperReg( 4436 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 4437 MCRegister SrcRegX = TRI->getMatchingSuperReg( 4438 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 4439 // This instruction is reading and writing X registers. This may upset 4440 // the register scavenger and machine verifier, so we need to indicate 4441 // that we are reading an undefined value from SrcRegX, but a proper 4442 // value from SrcReg. 4443 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX) 4444 .addReg(AArch64::XZR) 4445 .addReg(SrcRegX, RegState::Undef) 4446 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 4447 } else { 4448 // Otherwise, expand to ORR WZR. 4449 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg) 4450 .addReg(AArch64::WZR) 4451 .addReg(SrcReg, getKillRegState(KillSrc)); 4452 } 4453 } 4454 return; 4455 } 4456 4457 // Copy a Predicate register by ORRing with itself. 4458 if (AArch64::PPRRegClass.contains(DestReg) && 4459 AArch64::PPRRegClass.contains(SrcReg)) { 4460 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 4461 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg) 4462 .addReg(SrcReg) // Pg 4463 .addReg(SrcReg) 4464 .addReg(SrcReg, getKillRegState(KillSrc)); 4465 return; 4466 } 4467 4468 // Copy a predicate-as-counter register by ORRing with itself as if it 4469 // were a regular predicate (mask) register. 4470 bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg); 4471 bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg); 4472 if (DestIsPNR || SrcIsPNR) { 4473 assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && 4474 "Unexpected predicate-as-counter register."); 4475 auto ToPPR = [](MCRegister R) -> MCRegister { 4476 return (R - AArch64::PN0) + AArch64::P0; 4477 }; 4478 MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg; 4479 MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg; 4480 4481 if (PPRSrcReg != PPRDestReg) { 4482 auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg) 4483 .addReg(PPRSrcReg) // Pg 4484 .addReg(PPRSrcReg) 4485 .addReg(PPRSrcReg, getKillRegState(KillSrc)); 4486 if (DestIsPNR) 4487 NewMI.addDef(DestReg, RegState::Implicit); 4488 } 4489 return; 4490 } 4491 4492 // Copy a Z register by ORRing with itself. 4493 if (AArch64::ZPRRegClass.contains(DestReg) && 4494 AArch64::ZPRRegClass.contains(SrcReg)) { 4495 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 4496 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg) 4497 .addReg(SrcReg) 4498 .addReg(SrcReg, getKillRegState(KillSrc)); 4499 return; 4500 } 4501 4502 // Copy a Z register pair by copying the individual sub-registers. 
4503 if ((AArch64::ZPR2RegClass.contains(DestReg) || 4504 AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) && 4505 (AArch64::ZPR2RegClass.contains(SrcReg) || 4506 AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) { 4507 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 4508 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1}; 4509 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 4510 Indices); 4511 return; 4512 } 4513 4514 // Copy a Z register triple by copying the individual sub-registers. 4515 if (AArch64::ZPR3RegClass.contains(DestReg) && 4516 AArch64::ZPR3RegClass.contains(SrcReg)) { 4517 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 4518 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, 4519 AArch64::zsub2}; 4520 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 4521 Indices); 4522 return; 4523 } 4524 4525 // Copy a Z register quad by copying the individual sub-registers. 4526 if ((AArch64::ZPR4RegClass.contains(DestReg) || 4527 AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) && 4528 (AArch64::ZPR4RegClass.contains(SrcReg) || 4529 AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) { 4530 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 4531 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, 4532 AArch64::zsub2, AArch64::zsub3}; 4533 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 4534 Indices); 4535 return; 4536 } 4537 4538 if (AArch64::GPR64spRegClass.contains(DestReg) && 4539 (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { 4540 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { 4541 // If either operand is SP, expand to ADD #0. 4542 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg) 4543 .addReg(SrcReg, getKillRegState(KillSrc)) 4544 .addImm(0) 4545 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 4546 } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) { 4547 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg) 4548 .addImm(0) 4549 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 4550 } else { 4551 // Otherwise, expand to ORR XZR. 4552 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg) 4553 .addReg(AArch64::XZR) 4554 .addReg(SrcReg, getKillRegState(KillSrc)); 4555 } 4556 return; 4557 } 4558 4559 // Copy a DDDD register quad by copying the individual sub-registers. 4560 if (AArch64::DDDDRegClass.contains(DestReg) && 4561 AArch64::DDDDRegClass.contains(SrcReg)) { 4562 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 4563 AArch64::dsub2, AArch64::dsub3}; 4564 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 4565 Indices); 4566 return; 4567 } 4568 4569 // Copy a DDD register triple by copying the individual sub-registers. 4570 if (AArch64::DDDRegClass.contains(DestReg) && 4571 AArch64::DDDRegClass.contains(SrcReg)) { 4572 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 4573 AArch64::dsub2}; 4574 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 4575 Indices); 4576 return; 4577 } 4578 4579 // Copy a DD register pair by copying the individual sub-registers. 
4580 if (AArch64::DDRegClass.contains(DestReg) && 4581 AArch64::DDRegClass.contains(SrcReg)) { 4582 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1}; 4583 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 4584 Indices); 4585 return; 4586 } 4587 4588 // Copy a QQQQ register quad by copying the individual sub-registers. 4589 if (AArch64::QQQQRegClass.contains(DestReg) && 4590 AArch64::QQQQRegClass.contains(SrcReg)) { 4591 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 4592 AArch64::qsub2, AArch64::qsub3}; 4593 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 4594 Indices); 4595 return; 4596 } 4597 4598 // Copy a QQQ register triple by copying the individual sub-registers. 4599 if (AArch64::QQQRegClass.contains(DestReg) && 4600 AArch64::QQQRegClass.contains(SrcReg)) { 4601 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 4602 AArch64::qsub2}; 4603 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 4604 Indices); 4605 return; 4606 } 4607 4608 // Copy a QQ register pair by copying the individual sub-registers. 4609 if (AArch64::QQRegClass.contains(DestReg) && 4610 AArch64::QQRegClass.contains(SrcReg)) { 4611 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1}; 4612 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 4613 Indices); 4614 return; 4615 } 4616 4617 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) && 4618 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) { 4619 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64}; 4620 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs, 4621 AArch64::XZR, Indices); 4622 return; 4623 } 4624 4625 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) && 4626 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) { 4627 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32}; 4628 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs, 4629 AArch64::WZR, Indices); 4630 return; 4631 } 4632 4633 if (AArch64::FPR128RegClass.contains(DestReg) && 4634 AArch64::FPR128RegClass.contains(SrcReg)) { 4635 if (Subtarget.hasSVEorSME() && !Subtarget.isNeonAvailable()) 4636 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ)) 4637 .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define) 4638 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0)) 4639 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0)); 4640 else if (Subtarget.hasNEON()) 4641 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 4642 .addReg(SrcReg) 4643 .addReg(SrcReg, getKillRegState(KillSrc)); 4644 else { 4645 BuildMI(MBB, I, DL, get(AArch64::STRQpre)) 4646 .addReg(AArch64::SP, RegState::Define) 4647 .addReg(SrcReg, getKillRegState(KillSrc)) 4648 .addReg(AArch64::SP) 4649 .addImm(-16); 4650 BuildMI(MBB, I, DL, get(AArch64::LDRQpre)) 4651 .addReg(AArch64::SP, RegState::Define) 4652 .addReg(DestReg, RegState::Define) 4653 .addReg(AArch64::SP) 4654 .addImm(16); 4655 } 4656 return; 4657 } 4658 4659 if (AArch64::FPR64RegClass.contains(DestReg) && 4660 AArch64::FPR64RegClass.contains(SrcReg)) { 4661 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg) 4662 .addReg(SrcReg, getKillRegState(KillSrc)); 4663 return; 4664 } 4665 4666 if (AArch64::FPR32RegClass.contains(DestReg) && 4667 AArch64::FPR32RegClass.contains(SrcReg)) { 4668 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 4669 .addReg(SrcReg, getKillRegState(KillSrc)); 4670 return; 4671 } 4672 4673 if (AArch64::FPR16RegClass.contains(DestReg) && 
4674 AArch64::FPR16RegClass.contains(SrcReg)) { 4675 DestReg = 4676 RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass); 4677 SrcReg = 4678 RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass); 4679 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 4680 .addReg(SrcReg, getKillRegState(KillSrc)); 4681 return; 4682 } 4683 4684 if (AArch64::FPR8RegClass.contains(DestReg) && 4685 AArch64::FPR8RegClass.contains(SrcReg)) { 4686 DestReg = 4687 RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass); 4688 SrcReg = 4689 RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass); 4690 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 4691 .addReg(SrcReg, getKillRegState(KillSrc)); 4692 return; 4693 } 4694 4695 // Copies between GPR64 and FPR64. 4696 if (AArch64::FPR64RegClass.contains(DestReg) && 4697 AArch64::GPR64RegClass.contains(SrcReg)) { 4698 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) 4699 .addReg(SrcReg, getKillRegState(KillSrc)); 4700 return; 4701 } 4702 if (AArch64::GPR64RegClass.contains(DestReg) && 4703 AArch64::FPR64RegClass.contains(SrcReg)) { 4704 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg) 4705 .addReg(SrcReg, getKillRegState(KillSrc)); 4706 return; 4707 } 4708 // Copies between GPR32 and FPR32. 4709 if (AArch64::FPR32RegClass.contains(DestReg) && 4710 AArch64::GPR32RegClass.contains(SrcReg)) { 4711 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) 4712 .addReg(SrcReg, getKillRegState(KillSrc)); 4713 return; 4714 } 4715 if (AArch64::GPR32RegClass.contains(DestReg) && 4716 AArch64::FPR32RegClass.contains(SrcReg)) { 4717 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg) 4718 .addReg(SrcReg, getKillRegState(KillSrc)); 4719 return; 4720 } 4721 4722 if (DestReg == AArch64::NZCV) { 4723 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy"); 4724 BuildMI(MBB, I, DL, get(AArch64::MSR)) 4725 .addImm(AArch64SysReg::NZCV) 4726 .addReg(SrcReg, getKillRegState(KillSrc)) 4727 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define); 4728 return; 4729 } 4730 4731 if (SrcReg == AArch64::NZCV) { 4732 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy"); 4733 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg) 4734 .addImm(AArch64SysReg::NZCV) 4735 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); 4736 return; 4737 } 4738 4739 #ifndef NDEBUG 4740 const TargetRegisterInfo &TRI = getRegisterInfo(); 4741 errs() << TRI.getRegAsmName(DestReg) << " = COPY " 4742 << TRI.getRegAsmName(SrcReg) << "\n"; 4743 #endif 4744 llvm_unreachable("unimplemented reg-to-reg copy"); 4745 } 4746 4747 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, 4748 MachineBasicBlock &MBB, 4749 MachineBasicBlock::iterator InsertBefore, 4750 const MCInstrDesc &MCID, 4751 Register SrcReg, bool IsKill, 4752 unsigned SubIdx0, unsigned SubIdx1, int FI, 4753 MachineMemOperand *MMO) { 4754 Register SrcReg0 = SrcReg; 4755 Register SrcReg1 = SrcReg; 4756 if (SrcReg.isPhysical()) { 4757 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0); 4758 SubIdx0 = 0; 4759 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1); 4760 SubIdx1 = 0; 4761 } 4762 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 4763 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0) 4764 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1) 4765 .addFrameIndex(FI) 4766 .addImm(0) 4767 .addMemOperand(MMO); 4768 } 4769 4770 void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, 4771 MachineBasicBlock::iterator MBBI, 4772 Register SrcReg, bool 
isKill, int FI, 4773 const TargetRegisterClass *RC, 4774 const TargetRegisterInfo *TRI, 4775 Register VReg) const { 4776 MachineFunction &MF = *MBB.getParent(); 4777 MachineFrameInfo &MFI = MF.getFrameInfo(); 4778 4779 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 4780 MachineMemOperand *MMO = 4781 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, 4782 MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); 4783 unsigned Opc = 0; 4784 bool Offset = true; 4785 MCRegister PNRReg = MCRegister::NoRegister; 4786 unsigned StackID = TargetStackID::Default; 4787 switch (TRI->getSpillSize(*RC)) { 4788 case 1: 4789 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 4790 Opc = AArch64::STRBui; 4791 break; 4792 case 2: 4793 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 4794 Opc = AArch64::STRHui; 4795 else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { 4796 assert(Subtarget.hasSVEorSME() && 4797 "Unexpected register store without SVE store instructions"); 4798 Opc = AArch64::STR_PXI; 4799 StackID = TargetStackID::ScalableVector; 4800 } else if (AArch64::PNRRegClass.hasSubClassEq(RC)) { 4801 assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && 4802 "Unexpected register store without SVE2p1 or SME2"); 4803 if (SrcReg.isVirtual()) { 4804 auto NewSrcReg = 4805 MF.getRegInfo().createVirtualRegister(&AArch64::PPRRegClass); 4806 BuildMI(MBB, MBBI, DebugLoc(), get(TargetOpcode::COPY), NewSrcReg) 4807 .addReg(SrcReg); 4808 SrcReg = NewSrcReg; 4809 } else 4810 SrcReg = (SrcReg - AArch64::PN0) + AArch64::P0; 4811 Opc = AArch64::STR_PXI; 4812 StackID = TargetStackID::ScalableVector; 4813 } 4814 break; 4815 case 4: 4816 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 4817 Opc = AArch64::STRWui; 4818 if (SrcReg.isVirtual()) 4819 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass); 4820 else 4821 assert(SrcReg != AArch64::WSP); 4822 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 4823 Opc = AArch64::STRSui; 4824 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) { 4825 Opc = AArch64::STR_PPXI; 4826 StackID = TargetStackID::ScalableVector; 4827 } 4828 break; 4829 case 8: 4830 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 4831 Opc = AArch64::STRXui; 4832 if (SrcReg.isVirtual()) 4833 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 4834 else 4835 assert(SrcReg != AArch64::SP); 4836 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 4837 Opc = AArch64::STRDui; 4838 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 4839 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 4840 get(AArch64::STPWi), SrcReg, isKill, 4841 AArch64::sube32, AArch64::subo32, FI, MMO); 4842 return; 4843 } 4844 break; 4845 case 16: 4846 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 4847 Opc = AArch64::STRQui; 4848 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 4849 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 4850 Opc = AArch64::ST1Twov1d; 4851 Offset = false; 4852 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 4853 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 4854 get(AArch64::STPXi), SrcReg, isKill, 4855 AArch64::sube64, AArch64::subo64, FI, MMO); 4856 return; 4857 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 4858 assert(Subtarget.hasSVEorSME() && 4859 "Unexpected register store without SVE store instructions"); 4860 Opc = AArch64::STR_ZXI; 4861 StackID = TargetStackID::ScalableVector; 4862 } 4863 break; 4864 case 24: 4865 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 4866 
assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 4867 Opc = AArch64::ST1Threev1d; 4868 Offset = false; 4869 } 4870 break; 4871 case 32: 4872 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 4873 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 4874 Opc = AArch64::ST1Fourv1d; 4875 Offset = false; 4876 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 4877 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 4878 Opc = AArch64::ST1Twov2d; 4879 Offset = false; 4880 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) || 4881 AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { 4882 assert(Subtarget.hasSVEorSME() && 4883 "Unexpected register store without SVE store instructions"); 4884 Opc = AArch64::STR_ZZXI; 4885 StackID = TargetStackID::ScalableVector; 4886 } 4887 break; 4888 case 48: 4889 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 4890 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 4891 Opc = AArch64::ST1Threev2d; 4892 Offset = false; 4893 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { 4894 assert(Subtarget.hasSVEorSME() && 4895 "Unexpected register store without SVE store instructions"); 4896 Opc = AArch64::STR_ZZZXI; 4897 StackID = TargetStackID::ScalableVector; 4898 } 4899 break; 4900 case 64: 4901 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 4902 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 4903 Opc = AArch64::ST1Fourv2d; 4904 Offset = false; 4905 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) || 4906 AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { 4907 assert(Subtarget.hasSVEorSME() && 4908 "Unexpected register store without SVE store instructions"); 4909 Opc = AArch64::STR_ZZZZXI; 4910 StackID = TargetStackID::ScalableVector; 4911 } 4912 break; 4913 } 4914 assert(Opc && "Unknown register class"); 4915 MFI.setStackID(FI, StackID); 4916 4917 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 4918 .addReg(SrcReg, getKillRegState(isKill)) 4919 .addFrameIndex(FI); 4920 4921 if (Offset) 4922 MI.addImm(0); 4923 if (PNRReg.isValid()) 4924 MI.addDef(PNRReg, RegState::Implicit); 4925 MI.addMemOperand(MMO); 4926 } 4927 4928 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, 4929 MachineBasicBlock &MBB, 4930 MachineBasicBlock::iterator InsertBefore, 4931 const MCInstrDesc &MCID, 4932 Register DestReg, unsigned SubIdx0, 4933 unsigned SubIdx1, int FI, 4934 MachineMemOperand *MMO) { 4935 Register DestReg0 = DestReg; 4936 Register DestReg1 = DestReg; 4937 bool IsUndef = true; 4938 if (DestReg.isPhysical()) { 4939 DestReg0 = TRI.getSubReg(DestReg, SubIdx0); 4940 SubIdx0 = 0; 4941 DestReg1 = TRI.getSubReg(DestReg, SubIdx1); 4942 SubIdx1 = 0; 4943 IsUndef = false; 4944 } 4945 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 4946 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0) 4947 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1) 4948 .addFrameIndex(FI) 4949 .addImm(0) 4950 .addMemOperand(MMO); 4951 } 4952 4953 void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, 4954 MachineBasicBlock::iterator MBBI, 4955 Register DestReg, int FI, 4956 const TargetRegisterClass *RC, 4957 const TargetRegisterInfo *TRI, 4958 Register VReg) const { 4959 MachineFunction &MF = *MBB.getParent(); 4960 MachineFrameInfo &MFI = MF.getFrameInfo(); 4961 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 4962 MachineMemOperand *MMO = 4963 
MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, 4964 MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); 4965 4966 unsigned Opc = 0; 4967 bool Offset = true; 4968 unsigned StackID = TargetStackID::Default; 4969 Register PNRReg = MCRegister::NoRegister; 4970 switch (TRI->getSpillSize(*RC)) { 4971 case 1: 4972 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 4973 Opc = AArch64::LDRBui; 4974 break; 4975 case 2: 4976 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 4977 Opc = AArch64::LDRHui; 4978 else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { 4979 assert(Subtarget.hasSVEorSME() && 4980 "Unexpected register load without SVE load instructions"); 4981 Opc = AArch64::LDR_PXI; 4982 StackID = TargetStackID::ScalableVector; 4983 } else if (AArch64::PNRRegClass.hasSubClassEq(RC)) { 4984 assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && 4985 "Unexpected register load without SVE2p1 or SME2"); 4986 PNRReg = DestReg; 4987 if (DestReg.isVirtual()) 4988 DestReg = MF.getRegInfo().createVirtualRegister(&AArch64::PPRRegClass); 4989 else 4990 DestReg = (DestReg - AArch64::PN0) + AArch64::P0; 4991 Opc = AArch64::LDR_PXI; 4992 StackID = TargetStackID::ScalableVector; 4993 } 4994 break; 4995 case 4: 4996 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 4997 Opc = AArch64::LDRWui; 4998 if (DestReg.isVirtual()) 4999 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass); 5000 else 5001 assert(DestReg != AArch64::WSP); 5002 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 5003 Opc = AArch64::LDRSui; 5004 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) { 5005 Opc = AArch64::LDR_PPXI; 5006 StackID = TargetStackID::ScalableVector; 5007 } 5008 break; 5009 case 8: 5010 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 5011 Opc = AArch64::LDRXui; 5012 if (DestReg.isVirtual()) 5013 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass); 5014 else 5015 assert(DestReg != AArch64::SP); 5016 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 5017 Opc = AArch64::LDRDui; 5018 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 5019 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 5020 get(AArch64::LDPWi), DestReg, AArch64::sube32, 5021 AArch64::subo32, FI, MMO); 5022 return; 5023 } 5024 break; 5025 case 16: 5026 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 5027 Opc = AArch64::LDRQui; 5028 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 5029 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 5030 Opc = AArch64::LD1Twov1d; 5031 Offset = false; 5032 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 5033 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 5034 get(AArch64::LDPXi), DestReg, AArch64::sube64, 5035 AArch64::subo64, FI, MMO); 5036 return; 5037 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 5038 assert(Subtarget.hasSVEorSME() && 5039 "Unexpected register load without SVE load instructions"); 5040 Opc = AArch64::LDR_ZXI; 5041 StackID = TargetStackID::ScalableVector; 5042 } 5043 break; 5044 case 24: 5045 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 5046 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 5047 Opc = AArch64::LD1Threev1d; 5048 Offset = false; 5049 } 5050 break; 5051 case 32: 5052 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 5053 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 5054 Opc = AArch64::LD1Fourv1d; 5055 Offset = false; 5056 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 5057 assert(Subtarget.hasNEON() && "Unexpected register load 
without NEON"); 5058 Opc = AArch64::LD1Twov2d; 5059 Offset = false; 5060 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) || 5061 AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { 5062 assert(Subtarget.hasSVEorSME() && 5063 "Unexpected register load without SVE load instructions"); 5064 Opc = AArch64::LDR_ZZXI; 5065 StackID = TargetStackID::ScalableVector; 5066 } 5067 break; 5068 case 48: 5069 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 5070 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 5071 Opc = AArch64::LD1Threev2d; 5072 Offset = false; 5073 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { 5074 assert(Subtarget.hasSVEorSME() && 5075 "Unexpected register load without SVE load instructions"); 5076 Opc = AArch64::LDR_ZZZXI; 5077 StackID = TargetStackID::ScalableVector; 5078 } 5079 break; 5080 case 64: 5081 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 5082 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 5083 Opc = AArch64::LD1Fourv2d; 5084 Offset = false; 5085 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) || 5086 AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { 5087 assert(Subtarget.hasSVEorSME() && 5088 "Unexpected register load without SVE load instructions"); 5089 Opc = AArch64::LDR_ZZZZXI; 5090 StackID = TargetStackID::ScalableVector; 5091 } 5092 break; 5093 } 5094 5095 assert(Opc && "Unknown register class"); 5096 MFI.setStackID(FI, StackID); 5097 5098 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 5099 .addReg(DestReg, getDefRegState(true)) 5100 .addFrameIndex(FI); 5101 if (Offset) 5102 MI.addImm(0); 5103 if (PNRReg.isValid() && !PNRReg.isVirtual()) 5104 MI.addDef(PNRReg, RegState::Implicit); 5105 MI.addMemOperand(MMO); 5106 5107 if (PNRReg.isValid() && PNRReg.isVirtual()) 5108 BuildMI(MBB, MBBI, DebugLoc(), get(TargetOpcode::COPY), PNRReg) 5109 .addReg(DestReg); 5110 } 5111 5112 bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, 5113 const MachineInstr &UseMI, 5114 const TargetRegisterInfo *TRI) { 5115 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()), 5116 UseMI.getIterator()), 5117 [TRI](const MachineInstr &I) { 5118 return I.modifiesRegister(AArch64::NZCV, TRI) || 5119 I.readsRegister(AArch64::NZCV, TRI); 5120 }); 5121 } 5122 5123 void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets( 5124 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) { 5125 // The smallest scalable element supported by scaled SVE addressing 5126 // modes are predicates, which are 2 scalable bytes in size. So the scalable 5127 // byte offset must always be a multiple of 2. 5128 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset"); 5129 5130 // VGSized offsets are divided by '2', because the VG register is the 5131 // number of 64bit granules as opposed to 128bit vector chunks, 5132 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled. 5133 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes. 5134 // VG = n * 2 and the dwarf offset must be VG * 8 bytes. 5135 ByteSized = Offset.getFixed(); 5136 VGSized = Offset.getScalable() / 2; 5137 } 5138 5139 /// Returns the offset in parts to which this frame offset can be 5140 /// decomposed for the purpose of describing a frame offset. 5141 /// For non-scalable offsets this is simply its byte size.
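/// Worked example (illustrative): a scalable part of +34 bytes is 17
/// predicate-sized (2-byte) units, which stays as NumPredicateVectors = 17
/// with NumDataVectors = 0, whereas +130 scalable bytes (65 units) is split
/// into NumDataVectors = 8 and NumPredicateVectors = 1 so that most of the
/// adjustment can use ADDVL instead of several ADDPL instructions.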
5142 void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( 5143 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors, 5144 int64_t &NumDataVectors) { 5145 // The smallest scalable element supported by scaled SVE addressing 5146 // modes are predicates, which are 2 scalable bytes in size. So the scalable 5147 // byte offset must always be a multiple of 2. 5148 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset"); 5149 5150 NumBytes = Offset.getFixed(); 5151 NumDataVectors = 0; 5152 NumPredicateVectors = Offset.getScalable() / 2; 5153 // This method is used to get the offsets to adjust the frame offset. 5154 // If the function requires ADDPL to be used and needs more than two ADDPL 5155 // instructions, part of the offset is folded into NumDataVectors so that it 5156 // uses ADDVL for part of it, reducing the number of ADDPL instructions. 5157 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 || 5158 NumPredicateVectors > 62) { 5159 NumDataVectors = NumPredicateVectors / 8; 5160 NumPredicateVectors -= NumDataVectors * 8; 5161 } 5162 } 5163 5164 // Convenience function to create a DWARF expression for 5165 // Expr + NumBytes + NumVGScaledBytes * AArch64::VG 5166 static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes, 5167 int NumVGScaledBytes, unsigned VG, 5168 llvm::raw_string_ostream &Comment) { 5169 uint8_t buffer[16]; 5170 5171 if (NumBytes) { 5172 Expr.push_back(dwarf::DW_OP_consts); 5173 Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer)); 5174 Expr.push_back((uint8_t)dwarf::DW_OP_plus); 5175 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes); 5176 } 5177 5178 if (NumVGScaledBytes) { 5179 Expr.push_back((uint8_t)dwarf::DW_OP_consts); 5180 Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer)); 5181 5182 Expr.push_back((uint8_t)dwarf::DW_OP_bregx); 5183 Expr.append(buffer, buffer + encodeULEB128(VG, buffer)); 5184 Expr.push_back(0); 5185 5186 Expr.push_back((uint8_t)dwarf::DW_OP_mul); 5187 Expr.push_back((uint8_t)dwarf::DW_OP_plus); 5188 5189 Comment << (NumVGScaledBytes < 0 ? " - " : " + ") 5190 << std::abs(NumVGScaledBytes) << " * VG"; 5191 } 5192 } 5193 5194 // Creates an MCCFIInstruction: 5195 // { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr } 5196 static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, 5197 unsigned Reg, 5198 const StackOffset &Offset) { 5199 int64_t NumBytes, NumVGScaledBytes; 5200 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes, 5201 NumVGScaledBytes); 5202 std::string CommentBuffer; 5203 llvm::raw_string_ostream Comment(CommentBuffer); 5204 5205 if (Reg == AArch64::SP) 5206 Comment << "sp"; 5207 else if (Reg == AArch64::FP) 5208 Comment << "fp"; 5209 else 5210 Comment << printReg(Reg, &TRI); 5211 5212 // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG) 5213 SmallString<64> Expr; 5214 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); 5215 Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg)); 5216 Expr.push_back(0); 5217 appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes, 5218 TRI.getDwarfRegNum(AArch64::VG, true), Comment); 5219 5220 // Wrap this into DW_CFA_def_cfa. 
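// For reference, the escape built below has the shape (illustrative):
//   DW_CFA_def_cfa_expression, ULEB128(length),
//   DW_OP_breg<Reg> 0
//   [, DW_OP_consts <bytes>, DW_OP_plus]
//   [, DW_OP_consts <vg-scaled bytes>, DW_OP_bregx VG 0, DW_OP_mul, DW_OP_plus]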
5221 SmallString<64> DefCfaExpr; 5222 DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression); 5223 uint8_t buffer[16]; 5224 DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer)); 5225 DefCfaExpr.append(Expr.str()); 5226 return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(), 5227 Comment.str()); 5228 } 5229 5230 MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI, 5231 unsigned FrameReg, unsigned Reg, 5232 const StackOffset &Offset, 5233 bool LastAdjustmentWasScalable) { 5234 if (Offset.getScalable()) 5235 return createDefCFAExpression(TRI, Reg, Offset); 5236 5237 if (FrameReg == Reg && !LastAdjustmentWasScalable) 5238 return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed())); 5239 5240 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); 5241 return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed()); 5242 } 5243 5244 MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI, 5245 unsigned Reg, 5246 const StackOffset &OffsetFromDefCFA) { 5247 int64_t NumBytes, NumVGScaledBytes; 5248 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets( 5249 OffsetFromDefCFA, NumBytes, NumVGScaledBytes); 5250 5251 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); 5252 5253 // Non-scalable offsets can use DW_CFA_offset directly. 5254 if (!NumVGScaledBytes) 5255 return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes); 5256 5257 std::string CommentBuffer; 5258 llvm::raw_string_ostream Comment(CommentBuffer); 5259 Comment << printReg(Reg, &TRI) << " @ cfa"; 5260 5261 // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG) 5262 SmallString<64> OffsetExpr; 5263 appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes, 5264 TRI.getDwarfRegNum(AArch64::VG, true), Comment); 5265 5266 // Wrap this into DW_CFA_expression 5267 SmallString<64> CfaExpr; 5268 CfaExpr.push_back(dwarf::DW_CFA_expression); 5269 uint8_t buffer[16]; 5270 CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer)); 5271 CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer)); 5272 CfaExpr.append(OffsetExpr.str()); 5273 5274 return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(), 5275 Comment.str()); 5276 } 5277 5278 // Helper function to emit a frame offset adjustment from a given 5279 // pointer (SrcReg), stored into DestReg. This function is explicit 5280 // in that it requires the opcode. 5281 static void emitFrameOffsetAdj(MachineBasicBlock &MBB, 5282 MachineBasicBlock::iterator MBBI, 5283 const DebugLoc &DL, unsigned DestReg, 5284 unsigned SrcReg, int64_t Offset, unsigned Opc, 5285 const TargetInstrInfo *TII, 5286 MachineInstr::MIFlag Flag, bool NeedsWinCFI, 5287 bool *HasWinCFI, bool EmitCFAOffset, 5288 StackOffset CFAOffset, unsigned FrameReg) { 5289 int Sign = 1; 5290 unsigned MaxEncoding, ShiftSize; 5291 switch (Opc) { 5292 case AArch64::ADDXri: 5293 case AArch64::ADDSXri: 5294 case AArch64::SUBXri: 5295 case AArch64::SUBSXri: 5296 MaxEncoding = 0xfff; 5297 ShiftSize = 12; 5298 break; 5299 case AArch64::ADDVL_XXI: 5300 case AArch64::ADDPL_XXI: 5301 case AArch64::ADDSVL_XXI: 5302 case AArch64::ADDSPL_XXI: 5303 MaxEncoding = 31; 5304 ShiftSize = 0; 5305 if (Offset < 0) { 5306 MaxEncoding = 32; 5307 Sign = -1; 5308 Offset = -Offset; 5309 } 5310 break; 5311 default: 5312 llvm_unreachable("Unsupported opcode"); 5313 } 5314 5315 // `Offset` can be in bytes or in "scalable bytes". 
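// Illustrative mapping of the units handled here: ADDVL_XXI/ADDSVL_XXI count
// whole SVE vectors (16 "scalable bytes" per unit), ADDPL_XXI/ADDSPL_XXI
// count predicates (2 "scalable bytes" per unit), and the plain ADD/SUB
// immediate forms work in ordinary bytes (VScale == 1).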
5316 int VScale = 1; 5317 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI) 5318 VScale = 16; 5319 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI) 5320 VScale = 2; 5321 5322 // FIXME: If the offset won't fit in 24-bits, compute the offset into a 5323 // scratch register. If DestReg is a virtual register, use it as the 5324 // scratch register; otherwise, create a new virtual register (to be 5325 // replaced by the scavenger at the end of PEI). That case can be optimized 5326 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch 5327 // register can be loaded with offset%8 and the add/sub can use an extending 5328 // instruction with LSL#3. 5329 // Currently the function handles any offsets but generates a poor sequence 5330 // of code. 5331 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); 5332 5333 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; 5334 Register TmpReg = DestReg; 5335 if (TmpReg == AArch64::XZR) 5336 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister( 5337 &AArch64::GPR64RegClass); 5338 do { 5339 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue); 5340 unsigned LocalShiftSize = 0; 5341 if (ThisVal > MaxEncoding) { 5342 ThisVal = ThisVal >> ShiftSize; 5343 LocalShiftSize = ShiftSize; 5344 } 5345 assert((ThisVal >> ShiftSize) <= MaxEncoding && 5346 "Encoding cannot handle value that big"); 5347 5348 Offset -= ThisVal << LocalShiftSize; 5349 if (Offset == 0) 5350 TmpReg = DestReg; 5351 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg) 5352 .addReg(SrcReg) 5353 .addImm(Sign * (int)ThisVal); 5354 if (ShiftSize) 5355 MBI = MBI.addImm( 5356 AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize)); 5357 MBI = MBI.setMIFlag(Flag); 5358 5359 auto Change = 5360 VScale == 1 5361 ? 
StackOffset::getFixed(ThisVal << LocalShiftSize) 5362 : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize)); 5363 if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri) 5364 CFAOffset += Change; 5365 else 5366 CFAOffset -= Change; 5367 if (EmitCFAOffset && DestReg == TmpReg) { 5368 MachineFunction &MF = *MBB.getParent(); 5369 const TargetSubtargetInfo &STI = MF.getSubtarget(); 5370 const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); 5371 5372 unsigned CFIIndex = MF.addFrameInst( 5373 createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1)); 5374 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) 5375 .addCFIIndex(CFIIndex) 5376 .setMIFlags(Flag); 5377 } 5378 5379 if (NeedsWinCFI) { 5380 assert(Sign == 1 && "SEH directives should always have a positive sign"); 5381 int Imm = (int)(ThisVal << LocalShiftSize); 5382 if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) || 5383 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) { 5384 if (HasWinCFI) 5385 *HasWinCFI = true; 5386 if (Imm == 0) 5387 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag); 5388 else 5389 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)) 5390 .addImm(Imm) 5391 .setMIFlag(Flag); 5392 assert(Offset == 0 && "Expected remaining offset to be zero to " 5393 "emit a single SEH directive"); 5394 } else if (DestReg == AArch64::SP) { 5395 if (HasWinCFI) 5396 *HasWinCFI = true; 5397 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc"); 5398 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) 5399 .addImm(Imm) 5400 .setMIFlag(Flag); 5401 } 5402 } 5403 5404 SrcReg = TmpReg; 5405 } while (Offset); 5406 } 5407 5408 void llvm::emitFrameOffset(MachineBasicBlock &MBB, 5409 MachineBasicBlock::iterator MBBI, const DebugLoc &DL, 5410 unsigned DestReg, unsigned SrcReg, 5411 StackOffset Offset, const TargetInstrInfo *TII, 5412 MachineInstr::MIFlag Flag, bool SetNZCV, 5413 bool NeedsWinCFI, bool *HasWinCFI, 5414 bool EmitCFAOffset, StackOffset CFAOffset, 5415 unsigned FrameReg) { 5416 // If a function is marked as arm_locally_streaming, then the runtime value of 5417 // vscale in the prologue/epilogue is different the runtime value of vscale 5418 // in the function's body. To avoid having to consider multiple vscales, 5419 // we can use `addsvl` to allocate any scalable stack-slots, which under 5420 // most circumstances will be only locals, not callee-save slots. 5421 const Function &F = MBB.getParent()->getFunction(); 5422 bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body"); 5423 5424 int64_t Bytes, NumPredicateVectors, NumDataVectors; 5425 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( 5426 Offset, Bytes, NumPredicateVectors, NumDataVectors); 5427 5428 // First emit non-scalable frame offsets, or a simple 'mov'. 5429 if (Bytes || (!Offset && SrcReg != DestReg)) { 5430 assert((DestReg != AArch64::SP || Bytes % 8 == 0) && 5431 "SP increment/decrement not 8-byte aligned"); 5432 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri; 5433 if (Bytes < 0) { 5434 Bytes = -Bytes; 5435 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri; 5436 } 5437 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag, 5438 NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset, 5439 FrameReg); 5440 CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri) 5441 ? 
StackOffset::getFixed(-Bytes) 5442 : StackOffset::getFixed(Bytes); 5443 SrcReg = DestReg; 5444 FrameReg = DestReg; 5445 } 5446 5447 assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) && 5448 "SetNZCV not supported with SVE vectors"); 5449 assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) && 5450 "WinCFI not supported with SVE vectors"); 5451 5452 if (NumDataVectors) { 5453 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors, 5454 UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI, 5455 TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset, 5456 CFAOffset, FrameReg); 5457 CFAOffset += StackOffset::getScalable(-NumDataVectors * 16); 5458 SrcReg = DestReg; 5459 } 5460 5461 if (NumPredicateVectors) { 5462 assert(DestReg != AArch64::SP && "Unaligned access to SP"); 5463 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors, 5464 UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI, 5465 TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset, 5466 CFAOffset, FrameReg); 5467 } 5468 } 5469 5470 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( 5471 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, 5472 MachineBasicBlock::iterator InsertPt, int FrameIndex, 5473 LiveIntervals *LIS, VirtRegMap *VRM) const { 5474 // This is a bit of a hack. Consider this instruction: 5475 // 5476 // %0 = COPY %sp; GPR64all:%0 5477 // 5478 // We explicitly chose GPR64all for the virtual register so such a copy might 5479 // be eliminated by RegisterCoalescer. However, that may not be possible, and 5480 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all 5481 // register class, TargetInstrInfo::foldMemoryOperand() is going to try. 5482 // 5483 // To prevent that, we are going to constrain the %0 register class here. 5484 if (MI.isFullCopy()) { 5485 Register DstReg = MI.getOperand(0).getReg(); 5486 Register SrcReg = MI.getOperand(1).getReg(); 5487 if (SrcReg == AArch64::SP && DstReg.isVirtual()) { 5488 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass); 5489 return nullptr; 5490 } 5491 if (DstReg == AArch64::SP && SrcReg.isVirtual()) { 5492 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 5493 return nullptr; 5494 } 5495 // Nothing can folded with copy from/to NZCV. 5496 if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV) 5497 return nullptr; 5498 } 5499 5500 // Handle the case where a copy is being spilled or filled but the source 5501 // and destination register class don't match. For example: 5502 // 5503 // %0 = COPY %xzr; GPR64common:%0 5504 // 5505 // In this case we can still safely fold away the COPY and generate the 5506 // following spill code: 5507 // 5508 // STRXui %xzr, %stack.0 5509 // 5510 // This also eliminates spilled cross register class COPYs (e.g. between x and 5511 // d regs) of the same size. For example: 5512 // 5513 // %0 = COPY %1; GPR64:%0, FPR64:%1 5514 // 5515 // will be filled as 5516 // 5517 // LDRDui %0, fi<#0> 5518 // 5519 // instead of 5520 // 5521 // LDRXui %Temp, fi<#0> 5522 // %0 = FMOV %Temp 5523 // 5524 if (MI.isCopy() && Ops.size() == 1 && 5525 // Make sure we're only folding the explicit COPY defs/uses. 
5526 (Ops[0] == 0 || Ops[0] == 1)) { 5527 bool IsSpill = Ops[0] == 0; 5528 bool IsFill = !IsSpill; 5529 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); 5530 const MachineRegisterInfo &MRI = MF.getRegInfo(); 5531 MachineBasicBlock &MBB = *MI.getParent(); 5532 const MachineOperand &DstMO = MI.getOperand(0); 5533 const MachineOperand &SrcMO = MI.getOperand(1); 5534 Register DstReg = DstMO.getReg(); 5535 Register SrcReg = SrcMO.getReg(); 5536 // This is slightly expensive to compute for physical regs since 5537 // getMinimalPhysRegClass is slow. 5538 auto getRegClass = [&](unsigned Reg) { 5539 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg) 5540 : TRI.getMinimalPhysRegClass(Reg); 5541 }; 5542 5543 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { 5544 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) == 5545 TRI.getRegSizeInBits(*getRegClass(SrcReg)) && 5546 "Mismatched register size in non subreg COPY"); 5547 if (IsSpill) 5548 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, 5549 getRegClass(SrcReg), &TRI, Register()); 5550 else 5551 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, 5552 getRegClass(DstReg), &TRI, Register()); 5553 return &*--InsertPt; 5554 } 5555 5556 // Handle cases like spilling def of: 5557 // 5558 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0 5559 // 5560 // where the physical register source can be widened and stored to the full 5561 // virtual reg destination stack slot, in this case producing: 5562 // 5563 // STRXui %xzr, %stack.0 5564 // 5565 if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR && 5566 TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) { 5567 assert(SrcMO.getSubReg() == 0 && 5568 "Unexpected subreg on physical register"); 5569 storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(), 5570 FrameIndex, &AArch64::GPR64RegClass, &TRI, 5571 Register()); 5572 return &*--InsertPt; 5573 } 5574 5575 // Handle cases like filling use of: 5576 // 5577 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1 5578 // 5579 // where we can load the full virtual reg source stack slot, into the subreg 5580 // destination, in this case producing: 5581 // 5582 // LDRWui %0:sub_32<def,read-undef>, %stack.0 5583 // 5584 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) { 5585 const TargetRegisterClass *FillRC; 5586 switch (DstMO.getSubReg()) { 5587 default: 5588 FillRC = nullptr; 5589 break; 5590 case AArch64::sub_32: 5591 FillRC = &AArch64::GPR32RegClass; 5592 break; 5593 case AArch64::ssub: 5594 FillRC = &AArch64::FPR32RegClass; 5595 break; 5596 case AArch64::dsub: 5597 FillRC = &AArch64::FPR64RegClass; 5598 break; 5599 } 5600 5601 if (FillRC) { 5602 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) == 5603 TRI.getRegSizeInBits(*FillRC) && 5604 "Mismatched regclass size on folded subreg COPY"); 5605 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI, 5606 Register()); 5607 MachineInstr &LoadMI = *--InsertPt; 5608 MachineOperand &LoadDst = LoadMI.getOperand(0); 5609 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load"); 5610 LoadDst.setSubReg(DstMO.getSubReg()); 5611 LoadDst.setIsUndef(); 5612 return &LoadMI; 5613 } 5614 } 5615 } 5616 5617 // Cannot fold. 5618 return nullptr; 5619 } 5620 5621 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, 5622 StackOffset &SOffset, 5623 bool *OutUseUnscaledOp, 5624 unsigned *OutUnscaledOp, 5625 int64_t *EmittableOffset) { 5626 // Set output values in case of early exit. 
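  // Illustrative example of the overall contract (an assumption for
  // exposition, not from the original source): for a scaled 8-byte access
  // such as LDRXui (Scale == 8, immediate range [0, 4095]), a requested byte
  // offset of 32776 results in an emittable immediate of 4095 (covering
  // 32760 bytes), with the remaining 16 bytes left in SOffset for the caller
  // to materialize separately.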
5627 if (EmittableOffset) 5628 *EmittableOffset = 0; 5629 if (OutUseUnscaledOp) 5630 *OutUseUnscaledOp = false; 5631 if (OutUnscaledOp) 5632 *OutUnscaledOp = 0; 5633 5634 // Exit early for structured vector spills/fills as they can't take an 5635 // immediate offset. 5636 switch (MI.getOpcode()) { 5637 default: 5638 break; 5639 case AArch64::LD1Rv1d: 5640 case AArch64::LD1Rv2s: 5641 case AArch64::LD1Rv2d: 5642 case AArch64::LD1Rv4h: 5643 case AArch64::LD1Rv4s: 5644 case AArch64::LD1Rv8b: 5645 case AArch64::LD1Rv8h: 5646 case AArch64::LD1Rv16b: 5647 case AArch64::LD1Twov2d: 5648 case AArch64::LD1Threev2d: 5649 case AArch64::LD1Fourv2d: 5650 case AArch64::LD1Twov1d: 5651 case AArch64::LD1Threev1d: 5652 case AArch64::LD1Fourv1d: 5653 case AArch64::ST1Twov2d: 5654 case AArch64::ST1Threev2d: 5655 case AArch64::ST1Fourv2d: 5656 case AArch64::ST1Twov1d: 5657 case AArch64::ST1Threev1d: 5658 case AArch64::ST1Fourv1d: 5659 case AArch64::ST1i8: 5660 case AArch64::ST1i16: 5661 case AArch64::ST1i32: 5662 case AArch64::ST1i64: 5663 case AArch64::IRG: 5664 case AArch64::IRGstack: 5665 case AArch64::STGloop: 5666 case AArch64::STZGloop: 5667 return AArch64FrameOffsetCannotUpdate; 5668 } 5669 5670 // Get the min/max offset and the scale. 5671 TypeSize ScaleValue(0U, false), Width(0U, false); 5672 int64_t MinOff, MaxOff; 5673 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff, 5674 MaxOff)) 5675 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 5676 5677 // Construct the complete offset. 5678 bool IsMulVL = ScaleValue.isScalable(); 5679 unsigned Scale = ScaleValue.getKnownMinValue(); 5680 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed(); 5681 5682 const MachineOperand &ImmOpnd = 5683 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode())); 5684 Offset += ImmOpnd.getImm() * Scale; 5685 5686 // If the offset doesn't match the scale, we rewrite the instruction to 5687 // use the unscaled instruction instead. Likewise, if we have a negative 5688 // offset and there is an unscaled op to use. 5689 std::optional<unsigned> UnscaledOp = 5690 AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode()); 5691 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0); 5692 if (useUnscaledOp && 5693 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff, 5694 MaxOff)) 5695 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 5696 5697 Scale = ScaleValue.getKnownMinValue(); 5698 assert(IsMulVL == ScaleValue.isScalable() && 5699 "Unscaled opcode has different value for scalable"); 5700 5701 int64_t Remainder = Offset % Scale; 5702 assert(!(Remainder && useUnscaledOp) && 5703 "Cannot have remainder when using unscaled op"); 5704 5705 assert(MinOff < MaxOff && "Unexpected Min/Max offsets"); 5706 int64_t NewOffset = Offset / Scale; 5707 if (MinOff <= NewOffset && NewOffset <= MaxOff) 5708 Offset = Remainder; 5709 else { 5710 NewOffset = NewOffset < 0 ? MinOff : MaxOff; 5711 Offset = Offset - NewOffset * Scale; 5712 } 5713 5714 if (EmittableOffset) 5715 *EmittableOffset = NewOffset; 5716 if (OutUseUnscaledOp) 5717 *OutUseUnscaledOp = useUnscaledOp; 5718 if (OutUnscaledOp && UnscaledOp) 5719 *OutUnscaledOp = *UnscaledOp; 5720 5721 if (IsMulVL) 5722 SOffset = StackOffset::get(SOffset.getFixed(), Offset); 5723 else 5724 SOffset = StackOffset::get(Offset, SOffset.getScalable()); 5725 return AArch64FrameOffsetCanUpdate | 5726 (SOffset ? 
0 : AArch64FrameOffsetIsLegal); 5727 } 5728 5729 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, 5730 unsigned FrameReg, StackOffset &Offset, 5731 const AArch64InstrInfo *TII) { 5732 unsigned Opcode = MI.getOpcode(); 5733 unsigned ImmIdx = FrameRegIdx + 1; 5734 5735 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) { 5736 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm()); 5737 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), 5738 MI.getOperand(0).getReg(), FrameReg, Offset, TII, 5739 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri)); 5740 MI.eraseFromParent(); 5741 Offset = StackOffset(); 5742 return true; 5743 } 5744 5745 int64_t NewOffset; 5746 unsigned UnscaledOp; 5747 bool UseUnscaledOp; 5748 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, 5749 &UnscaledOp, &NewOffset); 5750 if (Status & AArch64FrameOffsetCanUpdate) { 5751 if (Status & AArch64FrameOffsetIsLegal) 5752 // Replace the FrameIndex with FrameReg. 5753 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); 5754 if (UseUnscaledOp) 5755 MI.setDesc(TII->get(UnscaledOp)); 5756 5757 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); 5758 return !Offset; 5759 } 5760 5761 return false; 5762 } 5763 5764 void AArch64InstrInfo::insertNoop(MachineBasicBlock &MBB, 5765 MachineBasicBlock::iterator MI) const { 5766 DebugLoc DL; 5767 BuildMI(MBB, MI, DL, get(AArch64::HINT)).addImm(0); 5768 } 5769 5770 MCInst AArch64InstrInfo::getNop() const { 5771 return MCInstBuilder(AArch64::HINT).addImm(0); 5772 } 5773 5774 // AArch64 supports MachineCombiner. 5775 bool AArch64InstrInfo::useMachineCombiner() const { return true; } 5776 5777 // True when Opc sets flag 5778 static bool isCombineInstrSettingFlag(unsigned Opc) { 5779 switch (Opc) { 5780 case AArch64::ADDSWrr: 5781 case AArch64::ADDSWri: 5782 case AArch64::ADDSXrr: 5783 case AArch64::ADDSXri: 5784 case AArch64::SUBSWrr: 5785 case AArch64::SUBSXrr: 5786 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 5787 case AArch64::SUBSWri: 5788 case AArch64::SUBSXri: 5789 return true; 5790 default: 5791 break; 5792 } 5793 return false; 5794 } 5795 5796 // 32b Opcodes that can be combined with a MUL 5797 static bool isCombineInstrCandidate32(unsigned Opc) { 5798 switch (Opc) { 5799 case AArch64::ADDWrr: 5800 case AArch64::ADDWri: 5801 case AArch64::SUBWrr: 5802 case AArch64::ADDSWrr: 5803 case AArch64::ADDSWri: 5804 case AArch64::SUBSWrr: 5805 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 5806 case AArch64::SUBWri: 5807 case AArch64::SUBSWri: 5808 return true; 5809 default: 5810 break; 5811 } 5812 return false; 5813 } 5814 5815 // 64b Opcodes that can be combined with a MUL 5816 static bool isCombineInstrCandidate64(unsigned Opc) { 5817 switch (Opc) { 5818 case AArch64::ADDXrr: 5819 case AArch64::ADDXri: 5820 case AArch64::SUBXrr: 5821 case AArch64::ADDSXrr: 5822 case AArch64::ADDSXri: 5823 case AArch64::SUBSXrr: 5824 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 
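  // (Illustrative: MSUB X0, X1, X2, X3 computes X3 - X1*X2, so the operand
  // order matters when these subtract patterns are matched.)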
  case AArch64::SUBXri:
  case AArch64::SUBSXri:
  case AArch64::ADDv8i8:
  case AArch64::ADDv16i8:
  case AArch64::ADDv4i16:
  case AArch64::ADDv8i16:
  case AArch64::ADDv2i32:
  case AArch64::ADDv4i32:
  case AArch64::SUBv8i8:
  case AArch64::SUBv16i8:
  case AArch64::SUBv4i16:
  case AArch64::SUBv8i16:
  case AArch64::SUBv2i32:
  case AArch64::SUBv4i32:
    return true;
  default:
    break;
  }
  return false;
}

// FP Opcodes that can be combined with a FMUL.
static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
  switch (Inst.getOpcode()) {
  default:
    break;
  case AArch64::FADDHrr:
  case AArch64::FADDSrr:
  case AArch64::FADDDrr:
  case AArch64::FADDv4f16:
  case AArch64::FADDv8f16:
  case AArch64::FADDv2f32:
  case AArch64::FADDv2f64:
  case AArch64::FADDv4f32:
  case AArch64::FSUBHrr:
  case AArch64::FSUBSrr:
  case AArch64::FSUBDrr:
  case AArch64::FSUBv4f16:
  case AArch64::FSUBv8f16:
  case AArch64::FSUBv2f32:
  case AArch64::FSUBv2f64:
  case AArch64::FSUBv4f32:
    TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
    // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
    // the target options or if FADD/FSUB has the contract fast-math flag.
    return Options.UnsafeFPMath ||
           Options.AllowFPOpFusion == FPOpFusion::Fast ||
           Inst.getFlag(MachineInstr::FmContract);
  }
  return false;
}

// Opcodes that can be combined with a MUL
static bool isCombineInstrCandidate(unsigned Opc) {
  return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
}

//
// Utility routine that checks if \param MO is defined by an
// \param CombineOpc instruction in the basic block \param MBB
static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
                       unsigned CombineOpc, unsigned ZeroReg = 0,
                       bool CheckZeroReg = false) {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineInstr *MI = nullptr;

  if (MO.isReg() && MO.getReg().isVirtual())
    MI = MRI.getUniqueVRegDef(MO.getReg());
  // And it needs to be in the trace (otherwise, it won't have a depth).
  if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
    return false;
  // Must only be used by the instruction we are combining with.
  if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
    return false;

  if (CheckZeroReg) {
    assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
           MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
           MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
    // The third input reg must be zero.
    if (MI->getOperand(3).getReg() != ZeroReg)
      return false;
  }

  if (isCombineInstrSettingFlag(CombineOpc) &&
      MI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
    return false;

  return true;
}

//
// Is \param MO defined by an integer multiply and can be combined?
static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
                              unsigned MulOpc, unsigned ZeroReg) {
  return canCombine(MBB, MO, MulOpc, ZeroReg, true);
}

//
// Is \param MO defined by a floating-point multiply and can be combined?
5926 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, 5927 unsigned MulOpc) { 5928 return canCombine(MBB, MO, MulOpc); 5929 } 5930 5931 // TODO: There are many more machine instruction opcodes to match: 5932 // 1. Other data types (integer, vectors) 5933 // 2. Other math / logic operations (xor, or) 5934 // 3. Other forms of the same operation (intrinsics and other variants) 5935 bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst, 5936 bool Invert) const { 5937 if (Invert) 5938 return false; 5939 switch (Inst.getOpcode()) { 5940 // == Floating-point types == 5941 // -- Floating-point instructions -- 5942 case AArch64::FADDHrr: 5943 case AArch64::FADDSrr: 5944 case AArch64::FADDDrr: 5945 case AArch64::FMULHrr: 5946 case AArch64::FMULSrr: 5947 case AArch64::FMULDrr: 5948 case AArch64::FMULX16: 5949 case AArch64::FMULX32: 5950 case AArch64::FMULX64: 5951 // -- Advanced SIMD instructions -- 5952 case AArch64::FADDv4f16: 5953 case AArch64::FADDv8f16: 5954 case AArch64::FADDv2f32: 5955 case AArch64::FADDv4f32: 5956 case AArch64::FADDv2f64: 5957 case AArch64::FMULv4f16: 5958 case AArch64::FMULv8f16: 5959 case AArch64::FMULv2f32: 5960 case AArch64::FMULv4f32: 5961 case AArch64::FMULv2f64: 5962 case AArch64::FMULXv4f16: 5963 case AArch64::FMULXv8f16: 5964 case AArch64::FMULXv2f32: 5965 case AArch64::FMULXv4f32: 5966 case AArch64::FMULXv2f64: 5967 // -- SVE instructions -- 5968 // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX 5969 // in the SVE instruction set (though there are predicated ones). 5970 case AArch64::FADD_ZZZ_H: 5971 case AArch64::FADD_ZZZ_S: 5972 case AArch64::FADD_ZZZ_D: 5973 case AArch64::FMUL_ZZZ_H: 5974 case AArch64::FMUL_ZZZ_S: 5975 case AArch64::FMUL_ZZZ_D: 5976 return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath || 5977 (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && 5978 Inst.getFlag(MachineInstr::MIFlag::FmNsz)); 5979 5980 // == Integer types == 5981 // -- Base instructions -- 5982 // Opcodes MULWrr and MULXrr don't exist because 5983 // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of 5984 // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively. 5985 // The machine-combiner does not support three-source-operands machine 5986 // instruction. So we cannot reassociate MULs. 5987 case AArch64::ADDWrr: 5988 case AArch64::ADDXrr: 5989 case AArch64::ANDWrr: 5990 case AArch64::ANDXrr: 5991 case AArch64::ORRWrr: 5992 case AArch64::ORRXrr: 5993 case AArch64::EORWrr: 5994 case AArch64::EORXrr: 5995 case AArch64::EONWrr: 5996 case AArch64::EONXrr: 5997 // -- Advanced SIMD instructions -- 5998 // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL 5999 // in the Advanced SIMD instruction set. 
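  // Illustrative sketch (not from the original source) of what reporting an
  // opcode here enables: the MachineCombiner may rebalance a chain such as
  //   %t0 = ADDXrr %a, %b
  //   %t1 = ADDXrr %t0, %c
  //   %t2 = ADDXrr %t1, %d
  // into
  //   %t0 = ADDXrr %a, %b
  //   %s0 = ADDXrr %c, %d
  //   %t2 = ADDXrr %t0, %s0
  // to shorten the dependency chain.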
6000 case AArch64::ADDv8i8: 6001 case AArch64::ADDv16i8: 6002 case AArch64::ADDv4i16: 6003 case AArch64::ADDv8i16: 6004 case AArch64::ADDv2i32: 6005 case AArch64::ADDv4i32: 6006 case AArch64::ADDv1i64: 6007 case AArch64::ADDv2i64: 6008 case AArch64::MULv8i8: 6009 case AArch64::MULv16i8: 6010 case AArch64::MULv4i16: 6011 case AArch64::MULv8i16: 6012 case AArch64::MULv2i32: 6013 case AArch64::MULv4i32: 6014 case AArch64::ANDv8i8: 6015 case AArch64::ANDv16i8: 6016 case AArch64::ORRv8i8: 6017 case AArch64::ORRv16i8: 6018 case AArch64::EORv8i8: 6019 case AArch64::EORv16i8: 6020 // -- SVE instructions -- 6021 case AArch64::ADD_ZZZ_B: 6022 case AArch64::ADD_ZZZ_H: 6023 case AArch64::ADD_ZZZ_S: 6024 case AArch64::ADD_ZZZ_D: 6025 case AArch64::MUL_ZZZ_B: 6026 case AArch64::MUL_ZZZ_H: 6027 case AArch64::MUL_ZZZ_S: 6028 case AArch64::MUL_ZZZ_D: 6029 case AArch64::AND_ZZZ: 6030 case AArch64::ORR_ZZZ: 6031 case AArch64::EOR_ZZZ: 6032 return true; 6033 6034 default: 6035 return false; 6036 } 6037 } 6038 6039 /// Find instructions that can be turned into madd. 6040 static bool getMaddPatterns(MachineInstr &Root, 6041 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 6042 unsigned Opc = Root.getOpcode(); 6043 MachineBasicBlock &MBB = *Root.getParent(); 6044 bool Found = false; 6045 6046 if (!isCombineInstrCandidate(Opc)) 6047 return false; 6048 if (isCombineInstrSettingFlag(Opc)) { 6049 int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true); 6050 // When NZCV is live bail out. 6051 if (Cmp_NZCV == -1) 6052 return false; 6053 unsigned NewOpc = convertToNonFlagSettingOpc(Root); 6054 // When opcode can't change bail out. 6055 // CHECKME: do we miss any cases for opcode conversion? 6056 if (NewOpc == Opc) 6057 return false; 6058 Opc = NewOpc; 6059 } 6060 6061 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg, 6062 MachineCombinerPattern Pattern) { 6063 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) { 6064 Patterns.push_back(Pattern); 6065 Found = true; 6066 } 6067 }; 6068 6069 auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) { 6070 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) { 6071 Patterns.push_back(Pattern); 6072 Found = true; 6073 } 6074 }; 6075 6076 typedef MachineCombinerPattern MCP; 6077 6078 switch (Opc) { 6079 default: 6080 break; 6081 case AArch64::ADDWrr: 6082 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 6083 "ADDWrr does not have register operands"); 6084 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1); 6085 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2); 6086 break; 6087 case AArch64::ADDXrr: 6088 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1); 6089 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2); 6090 break; 6091 case AArch64::SUBWrr: 6092 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1); 6093 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2); 6094 break; 6095 case AArch64::SUBXrr: 6096 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1); 6097 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2); 6098 break; 6099 case AArch64::ADDWri: 6100 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1); 6101 break; 6102 case AArch64::ADDXri: 6103 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1); 6104 break; 6105 case AArch64::SUBWri: 6106 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1); 6107 break; 6108 case AArch64::SUBXri: 6109 
setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1); 6110 break; 6111 case AArch64::ADDv8i8: 6112 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1); 6113 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2); 6114 break; 6115 case AArch64::ADDv16i8: 6116 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1); 6117 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2); 6118 break; 6119 case AArch64::ADDv4i16: 6120 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1); 6121 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2); 6122 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1); 6123 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2); 6124 break; 6125 case AArch64::ADDv8i16: 6126 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1); 6127 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2); 6128 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1); 6129 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2); 6130 break; 6131 case AArch64::ADDv2i32: 6132 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1); 6133 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2); 6134 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1); 6135 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2); 6136 break; 6137 case AArch64::ADDv4i32: 6138 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1); 6139 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2); 6140 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1); 6141 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2); 6142 break; 6143 case AArch64::SUBv8i8: 6144 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1); 6145 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2); 6146 break; 6147 case AArch64::SUBv16i8: 6148 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1); 6149 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2); 6150 break; 6151 case AArch64::SUBv4i16: 6152 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1); 6153 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2); 6154 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1); 6155 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2); 6156 break; 6157 case AArch64::SUBv8i16: 6158 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1); 6159 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2); 6160 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1); 6161 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2); 6162 break; 6163 case AArch64::SUBv2i32: 6164 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1); 6165 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2); 6166 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1); 6167 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2); 6168 break; 6169 case AArch64::SUBv4i32: 6170 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1); 6171 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2); 6172 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1); 6173 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2); 6174 break; 6175 } 6176 return Found; 6177 } 6178 /// Floating-Point Support 6179 6180 /// Find instructions that can be turned into madd. 
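/// An illustrative example (assumed MIR, not from the original source):
///   %2:fpr32 = FMULSrr %0, %1
///   %3:fpr32 = FADDSrr killed %2, %4
/// can later be rewritten by the combiner as
///   %3:fpr32 = FMADDSrrr %0, %1, %4
/// when the contraction conditions checked in isCombineInstrCandidateFP hold.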
6181 static bool getFMAPatterns(MachineInstr &Root, 6182 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 6183 6184 if (!isCombineInstrCandidateFP(Root)) 6185 return false; 6186 6187 MachineBasicBlock &MBB = *Root.getParent(); 6188 bool Found = false; 6189 6190 auto Match = [&](int Opcode, int Operand, 6191 MachineCombinerPattern Pattern) -> bool { 6192 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) { 6193 Patterns.push_back(Pattern); 6194 return true; 6195 } 6196 return false; 6197 }; 6198 6199 typedef MachineCombinerPattern MCP; 6200 6201 switch (Root.getOpcode()) { 6202 default: 6203 assert(false && "Unsupported FP instruction in combiner\n"); 6204 break; 6205 case AArch64::FADDHrr: 6206 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 6207 "FADDHrr does not have register operands"); 6208 6209 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1); 6210 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2); 6211 break; 6212 case AArch64::FADDSrr: 6213 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 6214 "FADDSrr does not have register operands"); 6215 6216 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) || 6217 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1); 6218 6219 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) || 6220 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2); 6221 break; 6222 case AArch64::FADDDrr: 6223 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) || 6224 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1); 6225 6226 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) || 6227 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2); 6228 break; 6229 case AArch64::FADDv4f16: 6230 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) || 6231 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1); 6232 6233 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) || 6234 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2); 6235 break; 6236 case AArch64::FADDv8f16: 6237 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) || 6238 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1); 6239 6240 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) || 6241 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2); 6242 break; 6243 case AArch64::FADDv2f32: 6244 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) || 6245 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1); 6246 6247 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) || 6248 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2); 6249 break; 6250 case AArch64::FADDv2f64: 6251 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) || 6252 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1); 6253 6254 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) || 6255 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2); 6256 break; 6257 case AArch64::FADDv4f32: 6258 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) || 6259 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1); 6260 6261 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) || 6262 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2); 6263 break; 6264 case AArch64::FSUBHrr: 6265 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1); 6266 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2); 6267 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1); 6268 break; 
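  // Illustrative note (an assumption, not from the original source): the
  // FNMULSUB*_OP1 patterns above cover sequences like
  //   %2 = FNMULHrr %0, %1
  //   %3 = FSUBHrr killed %2, %4
  // which can later be emitted as a single FNMADD computing -(%0 * %1) - %4.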
6269 case AArch64::FSUBSrr: 6270 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1); 6271 6272 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) || 6273 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2); 6274 6275 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1); 6276 break; 6277 case AArch64::FSUBDrr: 6278 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1); 6279 6280 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) || 6281 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2); 6282 6283 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1); 6284 break; 6285 case AArch64::FSUBv4f16: 6286 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) || 6287 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2); 6288 6289 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) || 6290 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1); 6291 break; 6292 case AArch64::FSUBv8f16: 6293 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) || 6294 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2); 6295 6296 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) || 6297 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1); 6298 break; 6299 case AArch64::FSUBv2f32: 6300 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) || 6301 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2); 6302 6303 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) || 6304 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1); 6305 break; 6306 case AArch64::FSUBv2f64: 6307 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) || 6308 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2); 6309 6310 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) || 6311 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1); 6312 break; 6313 case AArch64::FSUBv4f32: 6314 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) || 6315 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2); 6316 6317 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) || 6318 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1); 6319 break; 6320 } 6321 return Found; 6322 } 6323 6324 static bool getFMULPatterns(MachineInstr &Root, 6325 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 6326 MachineBasicBlock &MBB = *Root.getParent(); 6327 bool Found = false; 6328 6329 auto Match = [&](unsigned Opcode, int Operand, 6330 MachineCombinerPattern Pattern) -> bool { 6331 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6332 MachineOperand &MO = Root.getOperand(Operand); 6333 MachineInstr *MI = nullptr; 6334 if (MO.isReg() && MO.getReg().isVirtual()) 6335 MI = MRI.getUniqueVRegDef(MO.getReg()); 6336 // Ignore No-op COPYs in FMUL(COPY(DUP(..))) 6337 if (MI && MI->getOpcode() == TargetOpcode::COPY && 6338 MI->getOperand(1).getReg().isVirtual()) 6339 MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg()); 6340 if (MI && MI->getOpcode() == Opcode) { 6341 Patterns.push_back(Pattern); 6342 return true; 6343 } 6344 return false; 6345 }; 6346 6347 typedef MachineCombinerPattern MCP; 6348 6349 switch (Root.getOpcode()) { 6350 default: 6351 return false; 6352 case AArch64::FMULv2f32: 6353 Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1); 6354 Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2); 6355 break; 6356 case AArch64::FMULv2f64: 6357 Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1); 6358 Found |= 
Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2); 6359 break; 6360 case AArch64::FMULv4f16: 6361 Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1); 6362 Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2); 6363 break; 6364 case AArch64::FMULv4f32: 6365 Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1); 6366 Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2); 6367 break; 6368 case AArch64::FMULv8f16: 6369 Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1); 6370 Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2); 6371 break; 6372 } 6373 6374 return Found; 6375 } 6376 6377 static bool getFNEGPatterns(MachineInstr &Root, 6378 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 6379 unsigned Opc = Root.getOpcode(); 6380 MachineBasicBlock &MBB = *Root.getParent(); 6381 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6382 6383 auto Match = [&](unsigned Opcode, MachineCombinerPattern Pattern) -> bool { 6384 MachineOperand &MO = Root.getOperand(1); 6385 MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg()); 6386 if (MI != nullptr && (MI->getOpcode() == Opcode) && 6387 MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) && 6388 Root.getFlag(MachineInstr::MIFlag::FmContract) && 6389 Root.getFlag(MachineInstr::MIFlag::FmNsz) && 6390 MI->getFlag(MachineInstr::MIFlag::FmContract) && 6391 MI->getFlag(MachineInstr::MIFlag::FmNsz)) { 6392 Patterns.push_back(Pattern); 6393 return true; 6394 } 6395 return false; 6396 }; 6397 6398 switch (Opc) { 6399 default: 6400 break; 6401 case AArch64::FNEGDr: 6402 return Match(AArch64::FMADDDrrr, MachineCombinerPattern::FNMADD); 6403 case AArch64::FNEGSr: 6404 return Match(AArch64::FMADDSrrr, MachineCombinerPattern::FNMADD); 6405 } 6406 6407 return false; 6408 } 6409 6410 /// Return true when a code sequence can improve throughput. It 6411 /// should be called only for instructions in loops. 
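/// For example (illustrative), replacing separate multiply and accumulate
/// instructions with an FMLA/MLA form tends to improve throughput in a loop
/// body even when it does not shorten the critical path of one iteration.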
6412 /// \param Pattern - combiner pattern 6413 bool AArch64InstrInfo::isThroughputPattern( 6414 MachineCombinerPattern Pattern) const { 6415 switch (Pattern) { 6416 default: 6417 break; 6418 case MachineCombinerPattern::FMULADDH_OP1: 6419 case MachineCombinerPattern::FMULADDH_OP2: 6420 case MachineCombinerPattern::FMULSUBH_OP1: 6421 case MachineCombinerPattern::FMULSUBH_OP2: 6422 case MachineCombinerPattern::FMULADDS_OP1: 6423 case MachineCombinerPattern::FMULADDS_OP2: 6424 case MachineCombinerPattern::FMULSUBS_OP1: 6425 case MachineCombinerPattern::FMULSUBS_OP2: 6426 case MachineCombinerPattern::FMULADDD_OP1: 6427 case MachineCombinerPattern::FMULADDD_OP2: 6428 case MachineCombinerPattern::FMULSUBD_OP1: 6429 case MachineCombinerPattern::FMULSUBD_OP2: 6430 case MachineCombinerPattern::FNMULSUBH_OP1: 6431 case MachineCombinerPattern::FNMULSUBS_OP1: 6432 case MachineCombinerPattern::FNMULSUBD_OP1: 6433 case MachineCombinerPattern::FMLAv4i16_indexed_OP1: 6434 case MachineCombinerPattern::FMLAv4i16_indexed_OP2: 6435 case MachineCombinerPattern::FMLAv8i16_indexed_OP1: 6436 case MachineCombinerPattern::FMLAv8i16_indexed_OP2: 6437 case MachineCombinerPattern::FMLAv1i32_indexed_OP1: 6438 case MachineCombinerPattern::FMLAv1i32_indexed_OP2: 6439 case MachineCombinerPattern::FMLAv1i64_indexed_OP1: 6440 case MachineCombinerPattern::FMLAv1i64_indexed_OP2: 6441 case MachineCombinerPattern::FMLAv4f16_OP2: 6442 case MachineCombinerPattern::FMLAv4f16_OP1: 6443 case MachineCombinerPattern::FMLAv8f16_OP1: 6444 case MachineCombinerPattern::FMLAv8f16_OP2: 6445 case MachineCombinerPattern::FMLAv2f32_OP2: 6446 case MachineCombinerPattern::FMLAv2f32_OP1: 6447 case MachineCombinerPattern::FMLAv2f64_OP1: 6448 case MachineCombinerPattern::FMLAv2f64_OP2: 6449 case MachineCombinerPattern::FMLAv2i32_indexed_OP1: 6450 case MachineCombinerPattern::FMLAv2i32_indexed_OP2: 6451 case MachineCombinerPattern::FMLAv2i64_indexed_OP1: 6452 case MachineCombinerPattern::FMLAv2i64_indexed_OP2: 6453 case MachineCombinerPattern::FMLAv4f32_OP1: 6454 case MachineCombinerPattern::FMLAv4f32_OP2: 6455 case MachineCombinerPattern::FMLAv4i32_indexed_OP1: 6456 case MachineCombinerPattern::FMLAv4i32_indexed_OP2: 6457 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: 6458 case MachineCombinerPattern::FMLSv4i16_indexed_OP2: 6459 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: 6460 case MachineCombinerPattern::FMLSv8i16_indexed_OP2: 6461 case MachineCombinerPattern::FMLSv1i32_indexed_OP2: 6462 case MachineCombinerPattern::FMLSv1i64_indexed_OP2: 6463 case MachineCombinerPattern::FMLSv2i32_indexed_OP2: 6464 case MachineCombinerPattern::FMLSv2i64_indexed_OP2: 6465 case MachineCombinerPattern::FMLSv4f16_OP1: 6466 case MachineCombinerPattern::FMLSv4f16_OP2: 6467 case MachineCombinerPattern::FMLSv8f16_OP1: 6468 case MachineCombinerPattern::FMLSv8f16_OP2: 6469 case MachineCombinerPattern::FMLSv2f32_OP2: 6470 case MachineCombinerPattern::FMLSv2f64_OP2: 6471 case MachineCombinerPattern::FMLSv4i32_indexed_OP2: 6472 case MachineCombinerPattern::FMLSv4f32_OP2: 6473 case MachineCombinerPattern::FMULv2i32_indexed_OP1: 6474 case MachineCombinerPattern::FMULv2i32_indexed_OP2: 6475 case MachineCombinerPattern::FMULv2i64_indexed_OP1: 6476 case MachineCombinerPattern::FMULv2i64_indexed_OP2: 6477 case MachineCombinerPattern::FMULv4i16_indexed_OP1: 6478 case MachineCombinerPattern::FMULv4i16_indexed_OP2: 6479 case MachineCombinerPattern::FMULv4i32_indexed_OP1: 6480 case MachineCombinerPattern::FMULv4i32_indexed_OP2: 6481 case 
MachineCombinerPattern::FMULv8i16_indexed_OP1: 6482 case MachineCombinerPattern::FMULv8i16_indexed_OP2: 6483 case MachineCombinerPattern::MULADDv8i8_OP1: 6484 case MachineCombinerPattern::MULADDv8i8_OP2: 6485 case MachineCombinerPattern::MULADDv16i8_OP1: 6486 case MachineCombinerPattern::MULADDv16i8_OP2: 6487 case MachineCombinerPattern::MULADDv4i16_OP1: 6488 case MachineCombinerPattern::MULADDv4i16_OP2: 6489 case MachineCombinerPattern::MULADDv8i16_OP1: 6490 case MachineCombinerPattern::MULADDv8i16_OP2: 6491 case MachineCombinerPattern::MULADDv2i32_OP1: 6492 case MachineCombinerPattern::MULADDv2i32_OP2: 6493 case MachineCombinerPattern::MULADDv4i32_OP1: 6494 case MachineCombinerPattern::MULADDv4i32_OP2: 6495 case MachineCombinerPattern::MULSUBv8i8_OP1: 6496 case MachineCombinerPattern::MULSUBv8i8_OP2: 6497 case MachineCombinerPattern::MULSUBv16i8_OP1: 6498 case MachineCombinerPattern::MULSUBv16i8_OP2: 6499 case MachineCombinerPattern::MULSUBv4i16_OP1: 6500 case MachineCombinerPattern::MULSUBv4i16_OP2: 6501 case MachineCombinerPattern::MULSUBv8i16_OP1: 6502 case MachineCombinerPattern::MULSUBv8i16_OP2: 6503 case MachineCombinerPattern::MULSUBv2i32_OP1: 6504 case MachineCombinerPattern::MULSUBv2i32_OP2: 6505 case MachineCombinerPattern::MULSUBv4i32_OP1: 6506 case MachineCombinerPattern::MULSUBv4i32_OP2: 6507 case MachineCombinerPattern::MULADDv4i16_indexed_OP1: 6508 case MachineCombinerPattern::MULADDv4i16_indexed_OP2: 6509 case MachineCombinerPattern::MULADDv8i16_indexed_OP1: 6510 case MachineCombinerPattern::MULADDv8i16_indexed_OP2: 6511 case MachineCombinerPattern::MULADDv2i32_indexed_OP1: 6512 case MachineCombinerPattern::MULADDv2i32_indexed_OP2: 6513 case MachineCombinerPattern::MULADDv4i32_indexed_OP1: 6514 case MachineCombinerPattern::MULADDv4i32_indexed_OP2: 6515 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 6516 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 6517 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 6518 case MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 6519 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 6520 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 6521 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 6522 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 6523 return true; 6524 } // end switch (Pattern) 6525 return false; 6526 } 6527 6528 /// Find other MI combine patterns. 6529 static bool getMiscPatterns(MachineInstr &Root, 6530 SmallVectorImpl<MachineCombinerPattern> &Patterns) 6531 { 6532 // A - (B + C) ==> (A - B) - C or (A - C) - B 6533 unsigned Opc = Root.getOpcode(); 6534 MachineBasicBlock &MBB = *Root.getParent(); 6535 6536 switch (Opc) { 6537 case AArch64::SUBWrr: 6538 case AArch64::SUBSWrr: 6539 case AArch64::SUBXrr: 6540 case AArch64::SUBSXrr: 6541 // Found candidate root. 6542 break; 6543 default: 6544 return false; 6545 } 6546 6547 if (isCombineInstrSettingFlag(Opc) && 6548 Root.findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 6549 return false; 6550 6551 if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) || 6552 canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) || 6553 canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) || 6554 canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) { 6555 Patterns.push_back(MachineCombinerPattern::SUBADD_OP1); 6556 Patterns.push_back(MachineCombinerPattern::SUBADD_OP2); 6557 return true; 6558 } 6559 6560 return false; 6561 } 6562 6563 /// Return true when there is potentially a faster code sequence for an 6564 /// instruction chain ending in \p Root. 
/// All potential patterns are listed in
/// the \p Patterns vector. Patterns should be sorted in priority order since
/// the pattern evaluator stops checking as soon as it finds a faster sequence.

bool AArch64InstrInfo::getMachineCombinerPatterns(
    MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
    bool DoRegPressureReduce) const {
  // Integer patterns
  if (getMaddPatterns(Root, Patterns))
    return true;
  // Floating point patterns
  if (getFMULPatterns(Root, Patterns))
    return true;
  if (getFMAPatterns(Root, Patterns))
    return true;
  if (getFNEGPatterns(Root, Patterns))
    return true;

  // Other patterns
  if (getMiscPatterns(Root, Patterns))
    return true;

  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
                                                     DoRegPressureReduce);
}

enum class FMAInstKind { Default, Indexed, Accumulator };
/// genFusedMultiply - Generate fused multiply instructions.
/// This function supports both integer and floating point instructions.
/// A typical example:
///   F|MUL I=A,B,0
///   F|ADD R,I,C
///   ==> F|MADD R,A,B,C
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the F|ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the F|MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the f|madd instruction
/// \param RC Register class of operands
/// \param kind The kind of fma instruction (addressing mode) to be generated
/// \param ReplacedAddend is the result register from the instruction
/// replacing the non-combined operand, if any.
static MachineInstr *
genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
                 const TargetInstrInfo *TII, MachineInstr &Root,
                 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
                 unsigned MaddOpc, const TargetRegisterClass *RC,
                 FMAInstKind kind = FMAInstKind::Default,
                 const Register *ReplacedAddend = nullptr) {
  assert(IdxMulOpd == 1 || IdxMulOpd == 2);

  unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
  Register ResultReg = Root.getOperand(0).getReg();
  Register SrcReg0 = MUL->getOperand(1).getReg();
  bool Src0IsKill = MUL->getOperand(1).isKill();
  Register SrcReg1 = MUL->getOperand(2).getReg();
  bool Src1IsKill = MUL->getOperand(2).isKill();

  Register SrcReg2;
  bool Src2IsKill;
  if (ReplacedAddend) {
    // If we just generated a new addend, we must be its only use.
6631 SrcReg2 = *ReplacedAddend; 6632 Src2IsKill = true; 6633 } else { 6634 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg(); 6635 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill(); 6636 } 6637 6638 if (ResultReg.isVirtual()) 6639 MRI.constrainRegClass(ResultReg, RC); 6640 if (SrcReg0.isVirtual()) 6641 MRI.constrainRegClass(SrcReg0, RC); 6642 if (SrcReg1.isVirtual()) 6643 MRI.constrainRegClass(SrcReg1, RC); 6644 if (SrcReg2.isVirtual()) 6645 MRI.constrainRegClass(SrcReg2, RC); 6646 6647 MachineInstrBuilder MIB; 6648 if (kind == FMAInstKind::Default) 6649 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg) 6650 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 6651 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 6652 .addReg(SrcReg2, getKillRegState(Src2IsKill)); 6653 else if (kind == FMAInstKind::Indexed) 6654 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg) 6655 .addReg(SrcReg2, getKillRegState(Src2IsKill)) 6656 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 6657 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 6658 .addImm(MUL->getOperand(3).getImm()); 6659 else if (kind == FMAInstKind::Accumulator) 6660 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg) 6661 .addReg(SrcReg2, getKillRegState(Src2IsKill)) 6662 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 6663 .addReg(SrcReg1, getKillRegState(Src1IsKill)); 6664 else 6665 assert(false && "Invalid FMA instruction kind \n"); 6666 // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL) 6667 InsInstrs.push_back(MIB); 6668 return MUL; 6669 } 6670 6671 static MachineInstr * 6672 genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI, 6673 const TargetInstrInfo *TII, MachineInstr &Root, 6674 SmallVectorImpl<MachineInstr *> &InsInstrs) { 6675 MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); 6676 6677 unsigned Opc = 0; 6678 const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg()); 6679 if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 6680 Opc = AArch64::FNMADDSrrr; 6681 else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) 6682 Opc = AArch64::FNMADDDrrr; 6683 else 6684 return nullptr; 6685 6686 Register ResultReg = Root.getOperand(0).getReg(); 6687 Register SrcReg0 = MAD->getOperand(1).getReg(); 6688 Register SrcReg1 = MAD->getOperand(2).getReg(); 6689 Register SrcReg2 = MAD->getOperand(3).getReg(); 6690 bool Src0IsKill = MAD->getOperand(1).isKill(); 6691 bool Src1IsKill = MAD->getOperand(2).isKill(); 6692 bool Src2IsKill = MAD->getOperand(3).isKill(); 6693 if (ResultReg.isVirtual()) 6694 MRI.constrainRegClass(ResultReg, RC); 6695 if (SrcReg0.isVirtual()) 6696 MRI.constrainRegClass(SrcReg0, RC); 6697 if (SrcReg1.isVirtual()) 6698 MRI.constrainRegClass(SrcReg1, RC); 6699 if (SrcReg2.isVirtual()) 6700 MRI.constrainRegClass(SrcReg2, RC); 6701 6702 MachineInstrBuilder MIB = 6703 BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg) 6704 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 6705 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 6706 .addReg(SrcReg2, getKillRegState(Src2IsKill)); 6707 InsInstrs.push_back(MIB); 6708 6709 return MAD; 6710 } 6711 6712 /// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane) 6713 static MachineInstr * 6714 genIndexedMultiply(MachineInstr &Root, 6715 SmallVectorImpl<MachineInstr *> &InsInstrs, 6716 unsigned IdxDupOp, unsigned MulOpc, 6717 const TargetRegisterClass *RC, MachineRegisterInfo &MRI) { 6718 assert(((IdxDupOp == 1) || (IdxDupOp == 2)) && 6719 "Invalid index of FMUL operand"); 6720 6721 MachineFunction &MF = *Root.getMF(); 6722 const 
TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); 6723 6724 MachineInstr *Dup = 6725 MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg()); 6726 6727 if (Dup->getOpcode() == TargetOpcode::COPY) 6728 Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg()); 6729 6730 Register DupSrcReg = Dup->getOperand(1).getReg(); 6731 MRI.clearKillFlags(DupSrcReg); 6732 MRI.constrainRegClass(DupSrcReg, RC); 6733 6734 unsigned DupSrcLane = Dup->getOperand(2).getImm(); 6735 6736 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1; 6737 MachineOperand &MulOp = Root.getOperand(IdxMulOp); 6738 6739 Register ResultReg = Root.getOperand(0).getReg(); 6740 6741 MachineInstrBuilder MIB; 6742 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg) 6743 .add(MulOp) 6744 .addReg(DupSrcReg) 6745 .addImm(DupSrcLane); 6746 6747 InsInstrs.push_back(MIB); 6748 return &Root; 6749 } 6750 6751 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate 6752 /// instructions. 6753 /// 6754 /// \see genFusedMultiply 6755 static MachineInstr *genFusedMultiplyAcc( 6756 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 6757 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 6758 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) { 6759 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 6760 FMAInstKind::Accumulator); 6761 } 6762 6763 /// genNeg - Helper to generate an intermediate negation of the second operand 6764 /// of Root 6765 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI, 6766 const TargetInstrInfo *TII, MachineInstr &Root, 6767 SmallVectorImpl<MachineInstr *> &InsInstrs, 6768 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, 6769 unsigned MnegOpc, const TargetRegisterClass *RC) { 6770 Register NewVR = MRI.createVirtualRegister(RC); 6771 MachineInstrBuilder MIB = 6772 BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR) 6773 .add(Root.getOperand(2)); 6774 InsInstrs.push_back(MIB); 6775 6776 assert(InstrIdxForVirtReg.empty()); 6777 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6778 6779 return NewVR; 6780 } 6781 6782 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate 6783 /// instructions with an additional negation of the accumulator 6784 static MachineInstr *genFusedMultiplyAccNeg( 6785 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 6786 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 6787 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd, 6788 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) { 6789 assert(IdxMulOpd == 1); 6790 6791 Register NewVR = 6792 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC); 6793 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 6794 FMAInstKind::Accumulator, &NewVR); 6795 } 6796 6797 /// genFusedMultiplyIdx - Helper to generate fused multiply accumulate 6798 /// instructions. 
6799 ///
6800 /// \see genFusedMultiply
6801 static MachineInstr *genFusedMultiplyIdx(
6802 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6803 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6804 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
6805 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6806 FMAInstKind::Indexed);
6807 }
6808
6809 /// genFusedMultiplyIdxNeg - Helper to generate fused multiply accumulate
6810 /// instructions with an additional negation of the accumulator
6811 static MachineInstr *genFusedMultiplyIdxNeg(
6812 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6813 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6814 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
6815 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
6816 assert(IdxMulOpd == 1);
6817
6818 Register NewVR =
6819 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
6820
6821 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6822 FMAInstKind::Indexed, &NewVR);
6823 }
6824
6825 /// genMaddR - Generate a madd instruction that combines the mul and add,
6826 /// using an extra virtual register.
6827 /// Example - an ADD intermediate needs to be stored in a register:
6828 /// MUL I=A,B,0
6829 /// ADD R,I,Imm
6830 /// ==> ORR V, ZR, Imm
6831 /// ==> MADD R,A,B,V
6832 /// \param MF Containing MachineFunction
6833 /// \param MRI Register information
6834 /// \param TII Target information
6835 /// \param Root is the ADD instruction
6836 /// \param [out] InsInstrs is a vector of machine instructions and will
6837 /// contain the generated madd instruction
6838 /// \param IdxMulOpd is the index of the operand in Root that is the result of
6839 /// the MUL. In the example above IdxMulOpd is 1.
6840 /// \param MaddOpc the opcode of the madd instruction
6841 /// \param VR is a virtual register that holds the value of an ADD operand
6842 /// (V in the example above).
6843 /// \param RC Register class of operands 6844 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, 6845 const TargetInstrInfo *TII, MachineInstr &Root, 6846 SmallVectorImpl<MachineInstr *> &InsInstrs, 6847 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, 6848 const TargetRegisterClass *RC) { 6849 assert(IdxMulOpd == 1 || IdxMulOpd == 2); 6850 6851 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); 6852 Register ResultReg = Root.getOperand(0).getReg(); 6853 Register SrcReg0 = MUL->getOperand(1).getReg(); 6854 bool Src0IsKill = MUL->getOperand(1).isKill(); 6855 Register SrcReg1 = MUL->getOperand(2).getReg(); 6856 bool Src1IsKill = MUL->getOperand(2).isKill(); 6857 6858 if (ResultReg.isVirtual()) 6859 MRI.constrainRegClass(ResultReg, RC); 6860 if (SrcReg0.isVirtual()) 6861 MRI.constrainRegClass(SrcReg0, RC); 6862 if (SrcReg1.isVirtual()) 6863 MRI.constrainRegClass(SrcReg1, RC); 6864 if (Register::isVirtualRegister(VR)) 6865 MRI.constrainRegClass(VR, RC); 6866 6867 MachineInstrBuilder MIB = 6868 BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg) 6869 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 6870 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 6871 .addReg(VR); 6872 // Insert the MADD 6873 InsInstrs.push_back(MIB); 6874 return MUL; 6875 } 6876 6877 /// Do the following transformation 6878 /// A - (B + C) ==> (A - B) - C 6879 /// A - (B + C) ==> (A - C) - B 6880 static void 6881 genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI, 6882 const TargetInstrInfo *TII, MachineInstr &Root, 6883 SmallVectorImpl<MachineInstr *> &InsInstrs, 6884 SmallVectorImpl<MachineInstr *> &DelInstrs, 6885 unsigned IdxOpd1, 6886 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) { 6887 assert(IdxOpd1 == 1 || IdxOpd1 == 2); 6888 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 
2 : 1; 6889 MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg()); 6890 6891 Register ResultReg = Root.getOperand(0).getReg(); 6892 Register RegA = Root.getOperand(1).getReg(); 6893 bool RegAIsKill = Root.getOperand(1).isKill(); 6894 Register RegB = AddMI->getOperand(IdxOpd1).getReg(); 6895 bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill(); 6896 Register RegC = AddMI->getOperand(IdxOtherOpd).getReg(); 6897 bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill(); 6898 Register NewVR = MRI.createVirtualRegister(MRI.getRegClass(RegA)); 6899 6900 unsigned Opcode = Root.getOpcode(); 6901 if (Opcode == AArch64::SUBSWrr) 6902 Opcode = AArch64::SUBWrr; 6903 else if (Opcode == AArch64::SUBSXrr) 6904 Opcode = AArch64::SUBXrr; 6905 else 6906 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) && 6907 "Unexpected instruction opcode."); 6908 6909 MachineInstrBuilder MIB1 = 6910 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR) 6911 .addReg(RegA, getKillRegState(RegAIsKill)) 6912 .addReg(RegB, getKillRegState(RegBIsKill)); 6913 MachineInstrBuilder MIB2 = 6914 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg) 6915 .addReg(NewVR, getKillRegState(true)) 6916 .addReg(RegC, getKillRegState(RegCIsKill)); 6917 6918 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6919 InsInstrs.push_back(MIB1); 6920 InsInstrs.push_back(MIB2); 6921 DelInstrs.push_back(AddMI); 6922 } 6923 6924 /// When getMachineCombinerPatterns() finds potential patterns, 6925 /// this function generates the instructions that could replace the 6926 /// original code sequence 6927 void AArch64InstrInfo::genAlternativeCodeSequence( 6928 MachineInstr &Root, MachineCombinerPattern Pattern, 6929 SmallVectorImpl<MachineInstr *> &InsInstrs, 6930 SmallVectorImpl<MachineInstr *> &DelInstrs, 6931 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const { 6932 MachineBasicBlock &MBB = *Root.getParent(); 6933 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6934 MachineFunction &MF = *MBB.getParent(); 6935 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); 6936 6937 MachineInstr *MUL = nullptr; 6938 const TargetRegisterClass *RC; 6939 unsigned Opc; 6940 switch (Pattern) { 6941 default: 6942 // Reassociate instructions. 
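// Patterns not special-cased below are the generic reassociation patterns,
// which the base TargetInstrInfo implementation expands. Minimal sketch of
// what reassociation buys, in plain C++ (hypothetical names, illustration
// only, not part of this file):
//
//   int serial(int a, int b, int c, int d) {
//     return ((a + b) + c) + d;   // three adds in one dependency chain
//   }
//   int reassociated(int a, int b, int c, int d) {
//     return (a + b) + (c + d);   // (a+b) and (c+d) can issue in parallel
//   }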
6943 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, 6944 DelInstrs, InstrIdxForVirtReg); 6945 return; 6946 case MachineCombinerPattern::SUBADD_OP1: 6947 // A - (B + C) 6948 // ==> (A - B) - C 6949 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1, 6950 InstrIdxForVirtReg); 6951 break; 6952 case MachineCombinerPattern::SUBADD_OP2: 6953 // A - (B + C) 6954 // ==> (A - C) - B 6955 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2, 6956 InstrIdxForVirtReg); 6957 break; 6958 case MachineCombinerPattern::MULADDW_OP1: 6959 case MachineCombinerPattern::MULADDX_OP1: 6960 // MUL I=A,B,0 6961 // ADD R,I,C 6962 // ==> MADD R,A,B,C 6963 // --- Create(MADD); 6964 if (Pattern == MachineCombinerPattern::MULADDW_OP1) { 6965 Opc = AArch64::MADDWrrr; 6966 RC = &AArch64::GPR32RegClass; 6967 } else { 6968 Opc = AArch64::MADDXrrr; 6969 RC = &AArch64::GPR64RegClass; 6970 } 6971 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6972 break; 6973 case MachineCombinerPattern::MULADDW_OP2: 6974 case MachineCombinerPattern::MULADDX_OP2: 6975 // MUL I=A,B,0 6976 // ADD R,C,I 6977 // ==> MADD R,A,B,C 6978 // --- Create(MADD); 6979 if (Pattern == MachineCombinerPattern::MULADDW_OP2) { 6980 Opc = AArch64::MADDWrrr; 6981 RC = &AArch64::GPR32RegClass; 6982 } else { 6983 Opc = AArch64::MADDXrrr; 6984 RC = &AArch64::GPR64RegClass; 6985 } 6986 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6987 break; 6988 case MachineCombinerPattern::MULADDWI_OP1: 6989 case MachineCombinerPattern::MULADDXI_OP1: { 6990 // MUL I=A,B,0 6991 // ADD R,I,Imm 6992 // ==> MOV V, Imm 6993 // ==> MADD R,A,B,V 6994 // --- Create(MADD); 6995 const TargetRegisterClass *OrrRC; 6996 unsigned BitSize, OrrOpc, ZeroReg; 6997 if (Pattern == MachineCombinerPattern::MULADDWI_OP1) { 6998 OrrOpc = AArch64::ORRWri; 6999 OrrRC = &AArch64::GPR32spRegClass; 7000 BitSize = 32; 7001 ZeroReg = AArch64::WZR; 7002 Opc = AArch64::MADDWrrr; 7003 RC = &AArch64::GPR32RegClass; 7004 } else { 7005 OrrOpc = AArch64::ORRXri; 7006 OrrRC = &AArch64::GPR64spRegClass; 7007 BitSize = 64; 7008 ZeroReg = AArch64::XZR; 7009 Opc = AArch64::MADDXrrr; 7010 RC = &AArch64::GPR64RegClass; 7011 } 7012 Register NewVR = MRI.createVirtualRegister(OrrRC); 7013 uint64_t Imm = Root.getOperand(2).getImm(); 7014 7015 if (Root.getOperand(3).isImm()) { 7016 unsigned Val = Root.getOperand(3).getImm(); 7017 Imm = Imm << Val; 7018 } 7019 uint64_t UImm = SignExtend64(Imm, BitSize); 7020 // The immediate can be composed via a single instruction. 7021 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 7022 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn); 7023 if (Insn.size() != 1) 7024 return; 7025 auto MovI = Insn.begin(); 7026 MachineInstrBuilder MIB1; 7027 // MOV is an alias for one of three instructions: movz, movn, and orr. 
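// Illustrative sketch of the three single-instruction forms expandMOVImm
// may return here, shown for the 64-bit case (example constants only, not
// taken from this file; the W-register forms are analogous):
//   0x0000000000001234  ->  MOVZ Xd, #0x1234                  (MOVZXi)
//   0xffffffffffffedcb  ->  MOVN Xd, #0x1234                  (MOVNXi)
//   0x00ff00ff00ff00ff  ->  ORR  Xd, XZR, #0x00ff00ff00ff00ff (ORRXri)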
7028 if (MovI->Opcode == OrrOpc) 7029 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR) 7030 .addReg(ZeroReg) 7031 .addImm(MovI->Op2); 7032 else { 7033 if (BitSize == 32) 7034 assert((MovI->Opcode == AArch64::MOVNWi || 7035 MovI->Opcode == AArch64::MOVZWi) && 7036 "Expected opcode"); 7037 else 7038 assert((MovI->Opcode == AArch64::MOVNXi || 7039 MovI->Opcode == AArch64::MOVZXi) && 7040 "Expected opcode"); 7041 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR) 7042 .addImm(MovI->Op1) 7043 .addImm(MovI->Op2); 7044 } 7045 InsInstrs.push_back(MIB1); 7046 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7047 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 7048 break; 7049 } 7050 case MachineCombinerPattern::MULSUBW_OP1: 7051 case MachineCombinerPattern::MULSUBX_OP1: { 7052 // MUL I=A,B,0 7053 // SUB R,I, C 7054 // ==> SUB V, 0, C 7055 // ==> MADD R,A,B,V // = -C + A*B 7056 // --- Create(MADD); 7057 const TargetRegisterClass *SubRC; 7058 unsigned SubOpc, ZeroReg; 7059 if (Pattern == MachineCombinerPattern::MULSUBW_OP1) { 7060 SubOpc = AArch64::SUBWrr; 7061 SubRC = &AArch64::GPR32spRegClass; 7062 ZeroReg = AArch64::WZR; 7063 Opc = AArch64::MADDWrrr; 7064 RC = &AArch64::GPR32RegClass; 7065 } else { 7066 SubOpc = AArch64::SUBXrr; 7067 SubRC = &AArch64::GPR64spRegClass; 7068 ZeroReg = AArch64::XZR; 7069 Opc = AArch64::MADDXrrr; 7070 RC = &AArch64::GPR64RegClass; 7071 } 7072 Register NewVR = MRI.createVirtualRegister(SubRC); 7073 // SUB NewVR, 0, C 7074 MachineInstrBuilder MIB1 = 7075 BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR) 7076 .addReg(ZeroReg) 7077 .add(Root.getOperand(2)); 7078 InsInstrs.push_back(MIB1); 7079 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7080 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 7081 break; 7082 } 7083 case MachineCombinerPattern::MULSUBW_OP2: 7084 case MachineCombinerPattern::MULSUBX_OP2: 7085 // MUL I=A,B,0 7086 // SUB R,C,I 7087 // ==> MSUB R,A,B,C (computes C - A*B) 7088 // --- Create(MSUB); 7089 if (Pattern == MachineCombinerPattern::MULSUBW_OP2) { 7090 Opc = AArch64::MSUBWrrr; 7091 RC = &AArch64::GPR32RegClass; 7092 } else { 7093 Opc = AArch64::MSUBXrrr; 7094 RC = &AArch64::GPR64RegClass; 7095 } 7096 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7097 break; 7098 case MachineCombinerPattern::MULSUBWI_OP1: 7099 case MachineCombinerPattern::MULSUBXI_OP1: { 7100 // MUL I=A,B,0 7101 // SUB R,I, Imm 7102 // ==> MOV V, -Imm 7103 // ==> MADD R,A,B,V // = -Imm + A*B 7104 // --- Create(MADD); 7105 const TargetRegisterClass *OrrRC; 7106 unsigned BitSize, OrrOpc, ZeroReg; 7107 if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) { 7108 OrrOpc = AArch64::ORRWri; 7109 OrrRC = &AArch64::GPR32spRegClass; 7110 BitSize = 32; 7111 ZeroReg = AArch64::WZR; 7112 Opc = AArch64::MADDWrrr; 7113 RC = &AArch64::GPR32RegClass; 7114 } else { 7115 OrrOpc = AArch64::ORRXri; 7116 OrrRC = &AArch64::GPR64spRegClass; 7117 BitSize = 64; 7118 ZeroReg = AArch64::XZR; 7119 Opc = AArch64::MADDXrrr; 7120 RC = &AArch64::GPR64RegClass; 7121 } 7122 Register NewVR = MRI.createVirtualRegister(OrrRC); 7123 uint64_t Imm = Root.getOperand(2).getImm(); 7124 if (Root.getOperand(3).isImm()) { 7125 unsigned Val = Root.getOperand(3).getImm(); 7126 Imm = Imm << Val; 7127 } 7128 uint64_t UImm = SignExtend64(-Imm, BitSize); 7129 // The immediate can be composed via a single instruction. 
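// That is checked via Insn.size() below. The rewrite itself relies on the
// identity A*B - Imm == (-Imm) + A*B, which is why -Imm was sign-extended
// above. Minimal sketch in plain C++ (hypothetical names, illustration
// only):
//
//   uint64_t before(uint64_t a, uint64_t b, uint64_t imm) { return a * b - imm; }
//   uint64_t after(uint64_t a, uint64_t b, uint64_t imm) {
//     uint64_t v = 0 - imm; // MOV V, -Imm
//     return v + a * b;     // MADD R, A, B, V
//   }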
7130 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 7131 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn); 7132 if (Insn.size() != 1) 7133 return; 7134 auto MovI = Insn.begin(); 7135 MachineInstrBuilder MIB1; 7136 // MOV is an alias for one of three instructions: movz, movn, and orr. 7137 if (MovI->Opcode == OrrOpc) 7138 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR) 7139 .addReg(ZeroReg) 7140 .addImm(MovI->Op2); 7141 else { 7142 if (BitSize == 32) 7143 assert((MovI->Opcode == AArch64::MOVNWi || 7144 MovI->Opcode == AArch64::MOVZWi) && 7145 "Expected opcode"); 7146 else 7147 assert((MovI->Opcode == AArch64::MOVNXi || 7148 MovI->Opcode == AArch64::MOVZXi) && 7149 "Expected opcode"); 7150 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR) 7151 .addImm(MovI->Op1) 7152 .addImm(MovI->Op2); 7153 } 7154 InsInstrs.push_back(MIB1); 7155 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7156 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 7157 break; 7158 } 7159 7160 case MachineCombinerPattern::MULADDv8i8_OP1: 7161 Opc = AArch64::MLAv8i8; 7162 RC = &AArch64::FPR64RegClass; 7163 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7164 break; 7165 case MachineCombinerPattern::MULADDv8i8_OP2: 7166 Opc = AArch64::MLAv8i8; 7167 RC = &AArch64::FPR64RegClass; 7168 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7169 break; 7170 case MachineCombinerPattern::MULADDv16i8_OP1: 7171 Opc = AArch64::MLAv16i8; 7172 RC = &AArch64::FPR128RegClass; 7173 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7174 break; 7175 case MachineCombinerPattern::MULADDv16i8_OP2: 7176 Opc = AArch64::MLAv16i8; 7177 RC = &AArch64::FPR128RegClass; 7178 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7179 break; 7180 case MachineCombinerPattern::MULADDv4i16_OP1: 7181 Opc = AArch64::MLAv4i16; 7182 RC = &AArch64::FPR64RegClass; 7183 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7184 break; 7185 case MachineCombinerPattern::MULADDv4i16_OP2: 7186 Opc = AArch64::MLAv4i16; 7187 RC = &AArch64::FPR64RegClass; 7188 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7189 break; 7190 case MachineCombinerPattern::MULADDv8i16_OP1: 7191 Opc = AArch64::MLAv8i16; 7192 RC = &AArch64::FPR128RegClass; 7193 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7194 break; 7195 case MachineCombinerPattern::MULADDv8i16_OP2: 7196 Opc = AArch64::MLAv8i16; 7197 RC = &AArch64::FPR128RegClass; 7198 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7199 break; 7200 case MachineCombinerPattern::MULADDv2i32_OP1: 7201 Opc = AArch64::MLAv2i32; 7202 RC = &AArch64::FPR64RegClass; 7203 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7204 break; 7205 case MachineCombinerPattern::MULADDv2i32_OP2: 7206 Opc = AArch64::MLAv2i32; 7207 RC = &AArch64::FPR64RegClass; 7208 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7209 break; 7210 case MachineCombinerPattern::MULADDv4i32_OP1: 7211 Opc = AArch64::MLAv4i32; 7212 RC = &AArch64::FPR128RegClass; 7213 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7214 break; 7215 case MachineCombinerPattern::MULADDv4i32_OP2: 7216 Opc = AArch64::MLAv4i32; 7217 RC = &AArch64::FPR128RegClass; 7218 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7219 break; 7220 7221 case MachineCombinerPattern::MULSUBv8i8_OP1: 7222 Opc = AArch64::MLAv8i8; 7223 RC = 
&AArch64::FPR64RegClass; 7224 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 7225 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8, 7226 RC); 7227 break; 7228 case MachineCombinerPattern::MULSUBv8i8_OP2: 7229 Opc = AArch64::MLSv8i8; 7230 RC = &AArch64::FPR64RegClass; 7231 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7232 break; 7233 case MachineCombinerPattern::MULSUBv16i8_OP1: 7234 Opc = AArch64::MLAv16i8; 7235 RC = &AArch64::FPR128RegClass; 7236 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 7237 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8, 7238 RC); 7239 break; 7240 case MachineCombinerPattern::MULSUBv16i8_OP2: 7241 Opc = AArch64::MLSv16i8; 7242 RC = &AArch64::FPR128RegClass; 7243 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7244 break; 7245 case MachineCombinerPattern::MULSUBv4i16_OP1: 7246 Opc = AArch64::MLAv4i16; 7247 RC = &AArch64::FPR64RegClass; 7248 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 7249 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, 7250 RC); 7251 break; 7252 case MachineCombinerPattern::MULSUBv4i16_OP2: 7253 Opc = AArch64::MLSv4i16; 7254 RC = &AArch64::FPR64RegClass; 7255 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7256 break; 7257 case MachineCombinerPattern::MULSUBv8i16_OP1: 7258 Opc = AArch64::MLAv8i16; 7259 RC = &AArch64::FPR128RegClass; 7260 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 7261 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, 7262 RC); 7263 break; 7264 case MachineCombinerPattern::MULSUBv8i16_OP2: 7265 Opc = AArch64::MLSv8i16; 7266 RC = &AArch64::FPR128RegClass; 7267 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7268 break; 7269 case MachineCombinerPattern::MULSUBv2i32_OP1: 7270 Opc = AArch64::MLAv2i32; 7271 RC = &AArch64::FPR64RegClass; 7272 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 7273 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, 7274 RC); 7275 break; 7276 case MachineCombinerPattern::MULSUBv2i32_OP2: 7277 Opc = AArch64::MLSv2i32; 7278 RC = &AArch64::FPR64RegClass; 7279 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7280 break; 7281 case MachineCombinerPattern::MULSUBv4i32_OP1: 7282 Opc = AArch64::MLAv4i32; 7283 RC = &AArch64::FPR128RegClass; 7284 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 7285 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, 7286 RC); 7287 break; 7288 case MachineCombinerPattern::MULSUBv4i32_OP2: 7289 Opc = AArch64::MLSv4i32; 7290 RC = &AArch64::FPR128RegClass; 7291 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7292 break; 7293 7294 case MachineCombinerPattern::MULADDv4i16_indexed_OP1: 7295 Opc = AArch64::MLAv4i16_indexed; 7296 RC = &AArch64::FPR64RegClass; 7297 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7298 break; 7299 case MachineCombinerPattern::MULADDv4i16_indexed_OP2: 7300 Opc = AArch64::MLAv4i16_indexed; 7301 RC = &AArch64::FPR64RegClass; 7302 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7303 break; 7304 case MachineCombinerPattern::MULADDv8i16_indexed_OP1: 7305 Opc = AArch64::MLAv8i16_indexed; 7306 RC = &AArch64::FPR128RegClass; 7307 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7308 break; 7309 case MachineCombinerPattern::MULADDv8i16_indexed_OP2: 7310 Opc = AArch64::MLAv8i16_indexed; 7311 RC = &AArch64::FPR128RegClass; 7312 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7313 break; 7314 case 
MachineCombinerPattern::MULADDv2i32_indexed_OP1: 7315 Opc = AArch64::MLAv2i32_indexed; 7316 RC = &AArch64::FPR64RegClass; 7317 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7318 break; 7319 case MachineCombinerPattern::MULADDv2i32_indexed_OP2: 7320 Opc = AArch64::MLAv2i32_indexed; 7321 RC = &AArch64::FPR64RegClass; 7322 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7323 break; 7324 case MachineCombinerPattern::MULADDv4i32_indexed_OP1: 7325 Opc = AArch64::MLAv4i32_indexed; 7326 RC = &AArch64::FPR128RegClass; 7327 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7328 break; 7329 case MachineCombinerPattern::MULADDv4i32_indexed_OP2: 7330 Opc = AArch64::MLAv4i32_indexed; 7331 RC = &AArch64::FPR128RegClass; 7332 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7333 break; 7334 7335 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 7336 Opc = AArch64::MLAv4i16_indexed; 7337 RC = &AArch64::FPR64RegClass; 7338 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 7339 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, 7340 RC); 7341 break; 7342 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 7343 Opc = AArch64::MLSv4i16_indexed; 7344 RC = &AArch64::FPR64RegClass; 7345 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7346 break; 7347 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 7348 Opc = AArch64::MLAv8i16_indexed; 7349 RC = &AArch64::FPR128RegClass; 7350 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 7351 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, 7352 RC); 7353 break; 7354 case MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 7355 Opc = AArch64::MLSv8i16_indexed; 7356 RC = &AArch64::FPR128RegClass; 7357 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7358 break; 7359 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 7360 Opc = AArch64::MLAv2i32_indexed; 7361 RC = &AArch64::FPR64RegClass; 7362 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 7363 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, 7364 RC); 7365 break; 7366 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 7367 Opc = AArch64::MLSv2i32_indexed; 7368 RC = &AArch64::FPR64RegClass; 7369 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7370 break; 7371 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 7372 Opc = AArch64::MLAv4i32_indexed; 7373 RC = &AArch64::FPR128RegClass; 7374 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 7375 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, 7376 RC); 7377 break; 7378 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 7379 Opc = AArch64::MLSv4i32_indexed; 7380 RC = &AArch64::FPR128RegClass; 7381 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7382 break; 7383 7384 // Floating Point Support 7385 case MachineCombinerPattern::FMULADDH_OP1: 7386 Opc = AArch64::FMADDHrrr; 7387 RC = &AArch64::FPR16RegClass; 7388 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7389 break; 7390 case MachineCombinerPattern::FMULADDS_OP1: 7391 Opc = AArch64::FMADDSrrr; 7392 RC = &AArch64::FPR32RegClass; 7393 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7394 break; 7395 case MachineCombinerPattern::FMULADDD_OP1: 7396 Opc = AArch64::FMADDDrrr; 7397 RC = &AArch64::FPR64RegClass; 7398 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7399 break; 7400 7401 case MachineCombinerPattern::FMULADDH_OP2: 7402 Opc = AArch64::FMADDHrrr; 
7403 RC = &AArch64::FPR16RegClass; 7404 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7405 break; 7406 case MachineCombinerPattern::FMULADDS_OP2: 7407 Opc = AArch64::FMADDSrrr; 7408 RC = &AArch64::FPR32RegClass; 7409 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7410 break; 7411 case MachineCombinerPattern::FMULADDD_OP2: 7412 Opc = AArch64::FMADDDrrr; 7413 RC = &AArch64::FPR64RegClass; 7414 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7415 break; 7416 7417 case MachineCombinerPattern::FMLAv1i32_indexed_OP1: 7418 Opc = AArch64::FMLAv1i32_indexed; 7419 RC = &AArch64::FPR32RegClass; 7420 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7421 FMAInstKind::Indexed); 7422 break; 7423 case MachineCombinerPattern::FMLAv1i32_indexed_OP2: 7424 Opc = AArch64::FMLAv1i32_indexed; 7425 RC = &AArch64::FPR32RegClass; 7426 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7427 FMAInstKind::Indexed); 7428 break; 7429 7430 case MachineCombinerPattern::FMLAv1i64_indexed_OP1: 7431 Opc = AArch64::FMLAv1i64_indexed; 7432 RC = &AArch64::FPR64RegClass; 7433 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7434 FMAInstKind::Indexed); 7435 break; 7436 case MachineCombinerPattern::FMLAv1i64_indexed_OP2: 7437 Opc = AArch64::FMLAv1i64_indexed; 7438 RC = &AArch64::FPR64RegClass; 7439 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7440 FMAInstKind::Indexed); 7441 break; 7442 7443 case MachineCombinerPattern::FMLAv4i16_indexed_OP1: 7444 RC = &AArch64::FPR64RegClass; 7445 Opc = AArch64::FMLAv4i16_indexed; 7446 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7447 FMAInstKind::Indexed); 7448 break; 7449 case MachineCombinerPattern::FMLAv4f16_OP1: 7450 RC = &AArch64::FPR64RegClass; 7451 Opc = AArch64::FMLAv4f16; 7452 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7453 FMAInstKind::Accumulator); 7454 break; 7455 case MachineCombinerPattern::FMLAv4i16_indexed_OP2: 7456 RC = &AArch64::FPR64RegClass; 7457 Opc = AArch64::FMLAv4i16_indexed; 7458 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7459 FMAInstKind::Indexed); 7460 break; 7461 case MachineCombinerPattern::FMLAv4f16_OP2: 7462 RC = &AArch64::FPR64RegClass; 7463 Opc = AArch64::FMLAv4f16; 7464 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7465 FMAInstKind::Accumulator); 7466 break; 7467 7468 case MachineCombinerPattern::FMLAv2i32_indexed_OP1: 7469 case MachineCombinerPattern::FMLAv2f32_OP1: 7470 RC = &AArch64::FPR64RegClass; 7471 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) { 7472 Opc = AArch64::FMLAv2i32_indexed; 7473 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7474 FMAInstKind::Indexed); 7475 } else { 7476 Opc = AArch64::FMLAv2f32; 7477 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7478 FMAInstKind::Accumulator); 7479 } 7480 break; 7481 case MachineCombinerPattern::FMLAv2i32_indexed_OP2: 7482 case MachineCombinerPattern::FMLAv2f32_OP2: 7483 RC = &AArch64::FPR64RegClass; 7484 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) { 7485 Opc = AArch64::FMLAv2i32_indexed; 7486 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7487 FMAInstKind::Indexed); 7488 } else { 7489 Opc = AArch64::FMLAv2f32; 7490 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7491 FMAInstKind::Accumulator); 7492 } 7493 break; 7494 7495 case MachineCombinerPattern::FMLAv8i16_indexed_OP1: 7496 RC = 
&AArch64::FPR128RegClass; 7497 Opc = AArch64::FMLAv8i16_indexed; 7498 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7499 FMAInstKind::Indexed); 7500 break; 7501 case MachineCombinerPattern::FMLAv8f16_OP1: 7502 RC = &AArch64::FPR128RegClass; 7503 Opc = AArch64::FMLAv8f16; 7504 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7505 FMAInstKind::Accumulator); 7506 break; 7507 case MachineCombinerPattern::FMLAv8i16_indexed_OP2: 7508 RC = &AArch64::FPR128RegClass; 7509 Opc = AArch64::FMLAv8i16_indexed; 7510 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7511 FMAInstKind::Indexed); 7512 break; 7513 case MachineCombinerPattern::FMLAv8f16_OP2: 7514 RC = &AArch64::FPR128RegClass; 7515 Opc = AArch64::FMLAv8f16; 7516 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7517 FMAInstKind::Accumulator); 7518 break; 7519 7520 case MachineCombinerPattern::FMLAv2i64_indexed_OP1: 7521 case MachineCombinerPattern::FMLAv2f64_OP1: 7522 RC = &AArch64::FPR128RegClass; 7523 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) { 7524 Opc = AArch64::FMLAv2i64_indexed; 7525 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7526 FMAInstKind::Indexed); 7527 } else { 7528 Opc = AArch64::FMLAv2f64; 7529 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7530 FMAInstKind::Accumulator); 7531 } 7532 break; 7533 case MachineCombinerPattern::FMLAv2i64_indexed_OP2: 7534 case MachineCombinerPattern::FMLAv2f64_OP2: 7535 RC = &AArch64::FPR128RegClass; 7536 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) { 7537 Opc = AArch64::FMLAv2i64_indexed; 7538 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7539 FMAInstKind::Indexed); 7540 } else { 7541 Opc = AArch64::FMLAv2f64; 7542 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7543 FMAInstKind::Accumulator); 7544 } 7545 break; 7546 7547 case MachineCombinerPattern::FMLAv4i32_indexed_OP1: 7548 case MachineCombinerPattern::FMLAv4f32_OP1: 7549 RC = &AArch64::FPR128RegClass; 7550 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) { 7551 Opc = AArch64::FMLAv4i32_indexed; 7552 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7553 FMAInstKind::Indexed); 7554 } else { 7555 Opc = AArch64::FMLAv4f32; 7556 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7557 FMAInstKind::Accumulator); 7558 } 7559 break; 7560 7561 case MachineCombinerPattern::FMLAv4i32_indexed_OP2: 7562 case MachineCombinerPattern::FMLAv4f32_OP2: 7563 RC = &AArch64::FPR128RegClass; 7564 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) { 7565 Opc = AArch64::FMLAv4i32_indexed; 7566 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7567 FMAInstKind::Indexed); 7568 } else { 7569 Opc = AArch64::FMLAv4f32; 7570 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7571 FMAInstKind::Accumulator); 7572 } 7573 break; 7574 7575 case MachineCombinerPattern::FMULSUBH_OP1: 7576 Opc = AArch64::FNMSUBHrrr; 7577 RC = &AArch64::FPR16RegClass; 7578 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7579 break; 7580 case MachineCombinerPattern::FMULSUBS_OP1: 7581 Opc = AArch64::FNMSUBSrrr; 7582 RC = &AArch64::FPR32RegClass; 7583 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7584 break; 7585 case MachineCombinerPattern::FMULSUBD_OP1: 7586 Opc = AArch64::FNMSUBDrrr; 7587 RC = &AArch64::FPR64RegClass; 7588 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7589 
break; 7590 7591 case MachineCombinerPattern::FNMULSUBH_OP1: 7592 Opc = AArch64::FNMADDHrrr; 7593 RC = &AArch64::FPR16RegClass; 7594 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7595 break; 7596 case MachineCombinerPattern::FNMULSUBS_OP1: 7597 Opc = AArch64::FNMADDSrrr; 7598 RC = &AArch64::FPR32RegClass; 7599 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7600 break; 7601 case MachineCombinerPattern::FNMULSUBD_OP1: 7602 Opc = AArch64::FNMADDDrrr; 7603 RC = &AArch64::FPR64RegClass; 7604 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7605 break; 7606 7607 case MachineCombinerPattern::FMULSUBH_OP2: 7608 Opc = AArch64::FMSUBHrrr; 7609 RC = &AArch64::FPR16RegClass; 7610 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7611 break; 7612 case MachineCombinerPattern::FMULSUBS_OP2: 7613 Opc = AArch64::FMSUBSrrr; 7614 RC = &AArch64::FPR32RegClass; 7615 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7616 break; 7617 case MachineCombinerPattern::FMULSUBD_OP2: 7618 Opc = AArch64::FMSUBDrrr; 7619 RC = &AArch64::FPR64RegClass; 7620 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7621 break; 7622 7623 case MachineCombinerPattern::FMLSv1i32_indexed_OP2: 7624 Opc = AArch64::FMLSv1i32_indexed; 7625 RC = &AArch64::FPR32RegClass; 7626 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7627 FMAInstKind::Indexed); 7628 break; 7629 7630 case MachineCombinerPattern::FMLSv1i64_indexed_OP2: 7631 Opc = AArch64::FMLSv1i64_indexed; 7632 RC = &AArch64::FPR64RegClass; 7633 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7634 FMAInstKind::Indexed); 7635 break; 7636 7637 case MachineCombinerPattern::FMLSv4f16_OP1: 7638 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: { 7639 RC = &AArch64::FPR64RegClass; 7640 Register NewVR = MRI.createVirtualRegister(RC); 7641 MachineInstrBuilder MIB1 = 7642 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR) 7643 .add(Root.getOperand(2)); 7644 InsInstrs.push_back(MIB1); 7645 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7646 if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) { 7647 Opc = AArch64::FMLAv4f16; 7648 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7649 FMAInstKind::Accumulator, &NewVR); 7650 } else { 7651 Opc = AArch64::FMLAv4i16_indexed; 7652 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7653 FMAInstKind::Indexed, &NewVR); 7654 } 7655 break; 7656 } 7657 case MachineCombinerPattern::FMLSv4f16_OP2: 7658 RC = &AArch64::FPR64RegClass; 7659 Opc = AArch64::FMLSv4f16; 7660 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7661 FMAInstKind::Accumulator); 7662 break; 7663 case MachineCombinerPattern::FMLSv4i16_indexed_OP2: 7664 RC = &AArch64::FPR64RegClass; 7665 Opc = AArch64::FMLSv4i16_indexed; 7666 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7667 FMAInstKind::Indexed); 7668 break; 7669 7670 case MachineCombinerPattern::FMLSv2f32_OP2: 7671 case MachineCombinerPattern::FMLSv2i32_indexed_OP2: 7672 RC = &AArch64::FPR64RegClass; 7673 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) { 7674 Opc = AArch64::FMLSv2i32_indexed; 7675 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7676 FMAInstKind::Indexed); 7677 } else { 7678 Opc = AArch64::FMLSv2f32; 7679 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7680 FMAInstKind::Accumulator); 7681 } 7682 break; 7683 7684 case 
MachineCombinerPattern::FMLSv8f16_OP1: 7685 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: { 7686 RC = &AArch64::FPR128RegClass; 7687 Register NewVR = MRI.createVirtualRegister(RC); 7688 MachineInstrBuilder MIB1 = 7689 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR) 7690 .add(Root.getOperand(2)); 7691 InsInstrs.push_back(MIB1); 7692 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7693 if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) { 7694 Opc = AArch64::FMLAv8f16; 7695 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7696 FMAInstKind::Accumulator, &NewVR); 7697 } else { 7698 Opc = AArch64::FMLAv8i16_indexed; 7699 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7700 FMAInstKind::Indexed, &NewVR); 7701 } 7702 break; 7703 } 7704 case MachineCombinerPattern::FMLSv8f16_OP2: 7705 RC = &AArch64::FPR128RegClass; 7706 Opc = AArch64::FMLSv8f16; 7707 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7708 FMAInstKind::Accumulator); 7709 break; 7710 case MachineCombinerPattern::FMLSv8i16_indexed_OP2: 7711 RC = &AArch64::FPR128RegClass; 7712 Opc = AArch64::FMLSv8i16_indexed; 7713 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7714 FMAInstKind::Indexed); 7715 break; 7716 7717 case MachineCombinerPattern::FMLSv2f64_OP2: 7718 case MachineCombinerPattern::FMLSv2i64_indexed_OP2: 7719 RC = &AArch64::FPR128RegClass; 7720 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) { 7721 Opc = AArch64::FMLSv2i64_indexed; 7722 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7723 FMAInstKind::Indexed); 7724 } else { 7725 Opc = AArch64::FMLSv2f64; 7726 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7727 FMAInstKind::Accumulator); 7728 } 7729 break; 7730 7731 case MachineCombinerPattern::FMLSv4f32_OP2: 7732 case MachineCombinerPattern::FMLSv4i32_indexed_OP2: 7733 RC = &AArch64::FPR128RegClass; 7734 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) { 7735 Opc = AArch64::FMLSv4i32_indexed; 7736 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7737 FMAInstKind::Indexed); 7738 } else { 7739 Opc = AArch64::FMLSv4f32; 7740 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7741 FMAInstKind::Accumulator); 7742 } 7743 break; 7744 case MachineCombinerPattern::FMLSv2f32_OP1: 7745 case MachineCombinerPattern::FMLSv2i32_indexed_OP1: { 7746 RC = &AArch64::FPR64RegClass; 7747 Register NewVR = MRI.createVirtualRegister(RC); 7748 MachineInstrBuilder MIB1 = 7749 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR) 7750 .add(Root.getOperand(2)); 7751 InsInstrs.push_back(MIB1); 7752 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7753 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) { 7754 Opc = AArch64::FMLAv2i32_indexed; 7755 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7756 FMAInstKind::Indexed, &NewVR); 7757 } else { 7758 Opc = AArch64::FMLAv2f32; 7759 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7760 FMAInstKind::Accumulator, &NewVR); 7761 } 7762 break; 7763 } 7764 case MachineCombinerPattern::FMLSv4f32_OP1: 7765 case MachineCombinerPattern::FMLSv4i32_indexed_OP1: { 7766 RC = &AArch64::FPR128RegClass; 7767 Register NewVR = MRI.createVirtualRegister(RC); 7768 MachineInstrBuilder MIB1 = 7769 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR) 7770 .add(Root.getOperand(2)); 7771 InsInstrs.push_back(MIB1); 7772 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 
0)); 7773 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) { 7774 Opc = AArch64::FMLAv4i32_indexed; 7775 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7776 FMAInstKind::Indexed, &NewVR); 7777 } else { 7778 Opc = AArch64::FMLAv4f32; 7779 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7780 FMAInstKind::Accumulator, &NewVR); 7781 } 7782 break; 7783 } 7784 case MachineCombinerPattern::FMLSv2f64_OP1: 7785 case MachineCombinerPattern::FMLSv2i64_indexed_OP1: { 7786 RC = &AArch64::FPR128RegClass; 7787 Register NewVR = MRI.createVirtualRegister(RC); 7788 MachineInstrBuilder MIB1 = 7789 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR) 7790 .add(Root.getOperand(2)); 7791 InsInstrs.push_back(MIB1); 7792 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7793 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) { 7794 Opc = AArch64::FMLAv2i64_indexed; 7795 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7796 FMAInstKind::Indexed, &NewVR); 7797 } else { 7798 Opc = AArch64::FMLAv2f64; 7799 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7800 FMAInstKind::Accumulator, &NewVR); 7801 } 7802 break; 7803 } 7804 case MachineCombinerPattern::FMULv2i32_indexed_OP1: 7805 case MachineCombinerPattern::FMULv2i32_indexed_OP2: { 7806 unsigned IdxDupOp = 7807 (Pattern == MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1 : 2; 7808 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed, 7809 &AArch64::FPR128RegClass, MRI); 7810 break; 7811 } 7812 case MachineCombinerPattern::FMULv2i64_indexed_OP1: 7813 case MachineCombinerPattern::FMULv2i64_indexed_OP2: { 7814 unsigned IdxDupOp = 7815 (Pattern == MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1 : 2; 7816 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed, 7817 &AArch64::FPR128RegClass, MRI); 7818 break; 7819 } 7820 case MachineCombinerPattern::FMULv4i16_indexed_OP1: 7821 case MachineCombinerPattern::FMULv4i16_indexed_OP2: { 7822 unsigned IdxDupOp = 7823 (Pattern == MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1 : 2; 7824 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed, 7825 &AArch64::FPR128_loRegClass, MRI); 7826 break; 7827 } 7828 case MachineCombinerPattern::FMULv4i32_indexed_OP1: 7829 case MachineCombinerPattern::FMULv4i32_indexed_OP2: { 7830 unsigned IdxDupOp = 7831 (Pattern == MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1 : 2; 7832 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed, 7833 &AArch64::FPR128RegClass, MRI); 7834 break; 7835 } 7836 case MachineCombinerPattern::FMULv8i16_indexed_OP1: 7837 case MachineCombinerPattern::FMULv8i16_indexed_OP2: { 7838 unsigned IdxDupOp = 7839 (Pattern == MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1 : 2; 7840 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed, 7841 &AArch64::FPR128_loRegClass, MRI); 7842 break; 7843 } 7844 case MachineCombinerPattern::FNMADD: { 7845 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs); 7846 break; 7847 } 7848 7849 } // end switch (Pattern) 7850 // Record MUL and ADD/SUB for deletion 7851 if (MUL) 7852 DelInstrs.push_back(MUL); 7853 DelInstrs.push_back(&Root); 7854 7855 // Set the flags on the inserted instructions to be the merged flags of the 7856 // instructions that we have combined. 
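// For example, when an FMUL and an FADD have been fused into an FMLA above,
// the new instruction should carry the merged fast-math flags of the pair
// (e.g. "contract") rather than starting with no flags, so that later
// flag-aware passes still see the information from the original
// instructions. (Illustration only; mergeFlagsWith defines the exact merge
// policy.)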
7857 uint32_t Flags = Root.getFlags(); 7858 if (MUL) 7859 Flags = Root.mergeFlagsWith(*MUL); 7860 for (auto *MI : InsInstrs) 7861 MI->setFlags(Flags); 7862 } 7863 7864 /// Replace csincr-branch sequence by simple conditional branch 7865 /// 7866 /// Examples: 7867 /// 1. \code 7868 /// csinc w9, wzr, wzr, <condition code> 7869 /// tbnz w9, #0, 0x44 7870 /// \endcode 7871 /// to 7872 /// \code 7873 /// b.<inverted condition code> 7874 /// \endcode 7875 /// 7876 /// 2. \code 7877 /// csinc w9, wzr, wzr, <condition code> 7878 /// tbz w9, #0, 0x44 7879 /// \endcode 7880 /// to 7881 /// \code 7882 /// b.<condition code> 7883 /// \endcode 7884 /// 7885 /// Replace compare and branch sequence by TBZ/TBNZ instruction when the 7886 /// compare's constant operand is power of 2. 7887 /// 7888 /// Examples: 7889 /// \code 7890 /// and w8, w8, #0x400 7891 /// cbnz w8, L1 7892 /// \endcode 7893 /// to 7894 /// \code 7895 /// tbnz w8, #10, L1 7896 /// \endcode 7897 /// 7898 /// \param MI Conditional Branch 7899 /// \return True when the simple conditional branch is generated 7900 /// 7901 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const { 7902 bool IsNegativeBranch = false; 7903 bool IsTestAndBranch = false; 7904 unsigned TargetBBInMI = 0; 7905 switch (MI.getOpcode()) { 7906 default: 7907 llvm_unreachable("Unknown branch instruction?"); 7908 case AArch64::Bcc: 7909 return false; 7910 case AArch64::CBZW: 7911 case AArch64::CBZX: 7912 TargetBBInMI = 1; 7913 break; 7914 case AArch64::CBNZW: 7915 case AArch64::CBNZX: 7916 TargetBBInMI = 1; 7917 IsNegativeBranch = true; 7918 break; 7919 case AArch64::TBZW: 7920 case AArch64::TBZX: 7921 TargetBBInMI = 2; 7922 IsTestAndBranch = true; 7923 break; 7924 case AArch64::TBNZW: 7925 case AArch64::TBNZX: 7926 TargetBBInMI = 2; 7927 IsNegativeBranch = true; 7928 IsTestAndBranch = true; 7929 break; 7930 } 7931 // So we increment a zero register and test for bits other 7932 // than bit 0? Conservatively bail out in case the verifier 7933 // missed this case. 7934 if (IsTestAndBranch && MI.getOperand(1).getImm()) 7935 return false; 7936 7937 // Find Definition. 7938 assert(MI.getParent() && "Incomplete machine instruciton\n"); 7939 MachineBasicBlock *MBB = MI.getParent(); 7940 MachineFunction *MF = MBB->getParent(); 7941 MachineRegisterInfo *MRI = &MF->getRegInfo(); 7942 Register VReg = MI.getOperand(0).getReg(); 7943 if (!VReg.isVirtual()) 7944 return false; 7945 7946 MachineInstr *DefMI = MRI->getVRegDef(VReg); 7947 7948 // Look through COPY instructions to find definition. 7949 while (DefMI->isCopy()) { 7950 Register CopyVReg = DefMI->getOperand(1).getReg(); 7951 if (!MRI->hasOneNonDBGUse(CopyVReg)) 7952 return false; 7953 if (!MRI->hasOneDef(CopyVReg)) 7954 return false; 7955 DefMI = MRI->getVRegDef(CopyVReg); 7956 } 7957 7958 switch (DefMI->getOpcode()) { 7959 default: 7960 return false; 7961 // Fold AND into a TBZ/TBNZ if constant operand is power of 2. 7962 case AArch64::ANDWri: 7963 case AArch64::ANDXri: { 7964 if (IsTestAndBranch) 7965 return false; 7966 if (DefMI->getParent() != MBB) 7967 return false; 7968 if (!MRI->hasOneNonDBGUse(VReg)) 7969 return false; 7970 7971 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri); 7972 uint64_t Mask = AArch64_AM::decodeLogicalImmediate( 7973 DefMI->getOperand(2).getImm(), Is32Bit ? 
32 : 64); 7974 if (!isPowerOf2_64(Mask)) 7975 return false; 7976 7977 MachineOperand &MO = DefMI->getOperand(1); 7978 Register NewReg = MO.getReg(); 7979 if (!NewReg.isVirtual()) 7980 return false; 7981 7982 assert(!MRI->def_empty(NewReg) && "Register must be defined."); 7983 7984 MachineBasicBlock &RefToMBB = *MBB; 7985 MachineBasicBlock *TBB = MI.getOperand(1).getMBB(); 7986 DebugLoc DL = MI.getDebugLoc(); 7987 unsigned Imm = Log2_64(Mask); 7988 unsigned Opc = (Imm < 32) 7989 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW) 7990 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX); 7991 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc)) 7992 .addReg(NewReg) 7993 .addImm(Imm) 7994 .addMBB(TBB); 7995 // Register lives on to the CBZ now. 7996 MO.setIsKill(false); 7997 7998 // For immediate smaller than 32, we need to use the 32-bit 7999 // variant (W) in all cases. Indeed the 64-bit variant does not 8000 // allow to encode them. 8001 // Therefore, if the input register is 64-bit, we need to take the 8002 // 32-bit sub-part. 8003 if (!Is32Bit && Imm < 32) 8004 NewMI->getOperand(0).setSubReg(AArch64::sub_32); 8005 MI.eraseFromParent(); 8006 return true; 8007 } 8008 // Look for CSINC 8009 case AArch64::CSINCWr: 8010 case AArch64::CSINCXr: { 8011 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR && 8012 DefMI->getOperand(2).getReg() == AArch64::WZR) && 8013 !(DefMI->getOperand(1).getReg() == AArch64::XZR && 8014 DefMI->getOperand(2).getReg() == AArch64::XZR)) 8015 return false; 8016 8017 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) 8018 return false; 8019 8020 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm(); 8021 // Convert only when the condition code is not modified between 8022 // the CSINC and the branch. The CC may be used by other 8023 // instructions in between. 
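// Worked example of the rewrite performed below (analogous to the tbz/tbnz
// examples in the function-level comment): with DefMI being
//   csinc w9, wzr, wzr, eq
// w9 is 0 exactly when EQ holds, so
//   cbnz w9, L   ==>  b.ne L   (IsNegativeBranch: condition inverted)
//   cbz  w9, L   ==>  b.eq L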
8024 if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write)) 8025 return false; 8026 MachineBasicBlock &RefToMBB = *MBB; 8027 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB(); 8028 DebugLoc DL = MI.getDebugLoc(); 8029 if (IsNegativeBranch) 8030 CC = AArch64CC::getInvertedCondCode(CC); 8031 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB); 8032 MI.eraseFromParent(); 8033 return true; 8034 } 8035 } 8036 } 8037 8038 std::pair<unsigned, unsigned> 8039 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { 8040 const unsigned Mask = AArch64II::MO_FRAGMENT; 8041 return std::make_pair(TF & Mask, TF & ~Mask); 8042 } 8043 8044 ArrayRef<std::pair<unsigned, const char *>> 8045 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { 8046 using namespace AArch64II; 8047 8048 static const std::pair<unsigned, const char *> TargetFlags[] = { 8049 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"}, 8050 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"}, 8051 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"}, 8052 {MO_HI12, "aarch64-hi12"}}; 8053 return ArrayRef(TargetFlags); 8054 } 8055 8056 ArrayRef<std::pair<unsigned, const char *>> 8057 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { 8058 using namespace AArch64II; 8059 8060 static const std::pair<unsigned, const char *> TargetFlags[] = { 8061 {MO_COFFSTUB, "aarch64-coffstub"}, 8062 {MO_GOT, "aarch64-got"}, 8063 {MO_NC, "aarch64-nc"}, 8064 {MO_S, "aarch64-s"}, 8065 {MO_TLS, "aarch64-tls"}, 8066 {MO_DLLIMPORT, "aarch64-dllimport"}, 8067 {MO_DLLIMPORTAUX, "aarch64-dllimportaux"}, 8068 {MO_PREL, "aarch64-prel"}, 8069 {MO_TAGGED, "aarch64-tagged"}}; 8070 return ArrayRef(TargetFlags); 8071 } 8072 8073 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> 8074 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const { 8075 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] = 8076 {{MOSuppressPair, "aarch64-suppress-pair"}, 8077 {MOStridedAccess, "aarch64-strided-access"}}; 8078 return ArrayRef(TargetFlags); 8079 } 8080 8081 /// Constants defining how certain sequences should be outlined. 8082 /// This encompasses how an outlined function should be called, and what kind of 8083 /// frame should be emitted for that outlined function. 8084 /// 8085 /// \p MachineOutlinerDefault implies that the function should be called with 8086 /// a save and restore of LR to the stack. 8087 /// 8088 /// That is, 8089 /// 8090 /// I1 Save LR OUTLINED_FUNCTION: 8091 /// I2 --> BL OUTLINED_FUNCTION I1 8092 /// I3 Restore LR I2 8093 /// I3 8094 /// RET 8095 /// 8096 /// * Call construction overhead: 3 (save + BL + restore) 8097 /// * Frame construction overhead: 1 (ret) 8098 /// * Requires stack fixups? Yes 8099 /// 8100 /// \p MachineOutlinerTailCall implies that the function is being created from 8101 /// a sequence of instructions ending in a return. 8102 /// 8103 /// That is, 8104 /// 8105 /// I1 OUTLINED_FUNCTION: 8106 /// I2 --> B OUTLINED_FUNCTION I1 8107 /// RET I2 8108 /// RET 8109 /// 8110 /// * Call construction overhead: 1 (B) 8111 /// * Frame construction overhead: 0 (Return included in sequence) 8112 /// * Requires stack fixups? No 8113 /// 8114 /// \p MachineOutlinerNoLRSave implies that the function should be called using 8115 /// a BL instruction, but doesn't require LR to be saved and restored. This 8116 /// happens when LR is known to be dead. 
8117 /// 8118 /// That is, 8119 /// 8120 /// I1 OUTLINED_FUNCTION: 8121 /// I2 --> BL OUTLINED_FUNCTION I1 8122 /// I3 I2 8123 /// I3 8124 /// RET 8125 /// 8126 /// * Call construction overhead: 1 (BL) 8127 /// * Frame construction overhead: 1 (RET) 8128 /// * Requires stack fixups? No 8129 /// 8130 /// \p MachineOutlinerThunk implies that the function is being created from 8131 /// a sequence of instructions ending in a call. The outlined function is 8132 /// called with a BL instruction, and the outlined function tail-calls the 8133 /// original call destination. 8134 /// 8135 /// That is, 8136 /// 8137 /// I1 OUTLINED_FUNCTION: 8138 /// I2 --> BL OUTLINED_FUNCTION I1 8139 /// BL f I2 8140 /// B f 8141 /// * Call construction overhead: 1 (BL) 8142 /// * Frame construction overhead: 0 8143 /// * Requires stack fixups? No 8144 /// 8145 /// \p MachineOutlinerRegSave implies that the function should be called with a 8146 /// save and restore of LR to an available register. This allows us to avoid 8147 /// stack fixups. Note that this outlining variant is compatible with the 8148 /// NoLRSave case. 8149 /// 8150 /// That is, 8151 /// 8152 /// I1 Save LR OUTLINED_FUNCTION: 8153 /// I2 --> BL OUTLINED_FUNCTION I1 8154 /// I3 Restore LR I2 8155 /// I3 8156 /// RET 8157 /// 8158 /// * Call construction overhead: 3 (save + BL + restore) 8159 /// * Frame construction overhead: 1 (ret) 8160 /// * Requires stack fixups? No 8161 enum MachineOutlinerClass { 8162 MachineOutlinerDefault, /// Emit a save, restore, call, and return. 8163 MachineOutlinerTailCall, /// Only emit a branch. 8164 MachineOutlinerNoLRSave, /// Emit a call and return. 8165 MachineOutlinerThunk, /// Emit a call and tail-call. 8166 MachineOutlinerRegSave /// Same as default, but save to a register. 8167 }; 8168 8169 enum MachineOutlinerMBBFlags { 8170 LRUnavailableSomewhere = 0x2, 8171 HasCalls = 0x4, 8172 UnsafeRegsDead = 0x8 8173 }; 8174 8175 Register 8176 AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const { 8177 MachineFunction *MF = C.getMF(); 8178 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo(); 8179 const AArch64RegisterInfo *ARI = 8180 static_cast<const AArch64RegisterInfo *>(&TRI); 8181 // Check if there is an available register across the sequence that we can 8182 // use. 8183 for (unsigned Reg : AArch64::GPR64RegClass) { 8184 if (!ARI->isReservedReg(*MF, Reg) && 8185 Reg != AArch64::LR && // LR is not reserved, but don't use it. 8186 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved. 8187 Reg != AArch64::X17 && // Ditto for X17. 
8188 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) && 8189 C.isAvailableInsideSeq(Reg, TRI)) 8190 return Reg; 8191 } 8192 return Register(); 8193 } 8194 8195 static bool 8196 outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a, 8197 const outliner::Candidate &b) { 8198 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>(); 8199 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>(); 8200 8201 return MFIa->shouldSignReturnAddress(false) == MFIb->shouldSignReturnAddress(false) && 8202 MFIa->shouldSignReturnAddress(true) == MFIb->shouldSignReturnAddress(true); 8203 } 8204 8205 static bool 8206 outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a, 8207 const outliner::Candidate &b) { 8208 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>(); 8209 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>(); 8210 8211 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey(); 8212 } 8213 8214 static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a, 8215 const outliner::Candidate &b) { 8216 const AArch64Subtarget &SubtargetA = 8217 a.getMF()->getSubtarget<AArch64Subtarget>(); 8218 const AArch64Subtarget &SubtargetB = 8219 b.getMF()->getSubtarget<AArch64Subtarget>(); 8220 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps(); 8221 } 8222 8223 std::optional<outliner::OutlinedFunction> 8224 AArch64InstrInfo::getOutliningCandidateInfo( 8225 std::vector<outliner::Candidate> &RepeatedSequenceLocs) const { 8226 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0]; 8227 unsigned SequenceSize = 8228 std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0, 8229 [this](unsigned Sum, const MachineInstr &MI) { 8230 return Sum + getInstSizeInBytes(MI); 8231 }); 8232 unsigned NumBytesToCreateFrame = 0; 8233 8234 // We only allow outlining for functions having exactly matching return 8235 // address signing attributes, i.e., all share the same value for the 8236 // attribute "sign-return-address" and all share the same type of key they 8237 // are signed with. 8238 // Additionally we require all functions to simultaniously either support 8239 // v8.3a features or not. Otherwise an outlined function could get signed 8240 // using dedicated v8.3 instructions and a call from a function that doesn't 8241 // support v8.3 instructions would therefore be invalid. 8242 if (std::adjacent_find( 8243 RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), 8244 [](const outliner::Candidate &a, const outliner::Candidate &b) { 8245 // Return true if a and b are non-equal w.r.t. return address 8246 // signing or support of v8.3a features 8247 if (outliningCandidatesSigningScopeConsensus(a, b) && 8248 outliningCandidatesSigningKeyConsensus(a, b) && 8249 outliningCandidatesV8_3OpsConsensus(a, b)) { 8250 return false; 8251 } 8252 return true; 8253 }) != RepeatedSequenceLocs.end()) { 8254 return std::nullopt; 8255 } 8256 8257 // Since at this point all candidates agree on their return address signing 8258 // picking just one is fine. If the candidate functions potentially sign their 8259 // return addresses, the outlined function should do the same. Note that in 8260 // the case of "sign-return-address"="non-leaf" this is an assumption: It is 8261 // not certainly true that the outlined function will have to sign its return 8262 // address but this decision is made later, when the decision to outline 8263 // has already been made. 
8264 // The same holds for the number of additional instructions we need: On 8265 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is 8266 // necessary. However, at this point we don't know if the outlined function 8267 // will have a RET instruction so we assume the worst. 8268 const TargetRegisterInfo &TRI = getRegisterInfo(); 8269 // Performing a tail call may require extra checks when PAuth is enabled. 8270 // If PAuth is disabled, set it to zero for uniformity. 8271 unsigned NumBytesToCheckLRInTCEpilogue = 0; 8272 if (FirstCand.getMF() 8273 ->getInfo<AArch64FunctionInfo>() 8274 ->shouldSignReturnAddress(true)) { 8275 // One PAC and one AUT instructions 8276 NumBytesToCreateFrame += 8; 8277 8278 // PAuth is enabled - set extra tail call cost, if any. 8279 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(); 8280 NumBytesToCheckLRInTCEpilogue = 8281 AArch64PAuth::getCheckerSizeInBytes(LRCheckMethod); 8282 // Checking the authenticated LR value may significantly impact 8283 // SequenceSize, so account for it for more precise results. 8284 if (isTailCallReturnInst(*RepeatedSequenceLocs[0].back())) 8285 SequenceSize += NumBytesToCheckLRInTCEpilogue; 8286 8287 // We have to check if sp modifying instructions would get outlined. 8288 // If so we only allow outlining if sp is unchanged overall, so matching 8289 // sub and add instructions are okay to outline, all other sp modifications 8290 // are not 8291 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) { 8292 int SPValue = 0; 8293 MachineBasicBlock::iterator MBBI = C.front(); 8294 for (;;) { 8295 if (MBBI->modifiesRegister(AArch64::SP, &TRI)) { 8296 switch (MBBI->getOpcode()) { 8297 case AArch64::ADDXri: 8298 case AArch64::ADDWri: 8299 assert(MBBI->getNumOperands() == 4 && "Wrong number of operands"); 8300 assert(MBBI->getOperand(2).isImm() && 8301 "Expected operand to be immediate"); 8302 assert(MBBI->getOperand(1).isReg() && 8303 "Expected operand to be a register"); 8304 // Check if the add just increments sp. If so, we search for 8305 // matching sub instructions that decrement sp. If not, the 8306 // modification is illegal 8307 if (MBBI->getOperand(1).getReg() == AArch64::SP) 8308 SPValue += MBBI->getOperand(2).getImm(); 8309 else 8310 return true; 8311 break; 8312 case AArch64::SUBXri: 8313 case AArch64::SUBWri: 8314 assert(MBBI->getNumOperands() == 4 && "Wrong number of operands"); 8315 assert(MBBI->getOperand(2).isImm() && 8316 "Expected operand to be immediate"); 8317 assert(MBBI->getOperand(1).isReg() && 8318 "Expected operand to be a register"); 8319 // Check if the sub just decrements sp. If so, we search for 8320 // matching add instructions that increment sp. If not, the 8321 // modification is illegal 8322 if (MBBI->getOperand(1).getReg() == AArch64::SP) 8323 SPValue -= MBBI->getOperand(2).getImm(); 8324 else 8325 return true; 8326 break; 8327 default: 8328 return true; 8329 } 8330 } 8331 if (MBBI == C.back()) 8332 break; 8333 ++MBBI; 8334 } 8335 if (SPValue) 8336 return true; 8337 return false; 8338 }; 8339 // Remove candidates with illegal stack modifying instructions 8340 llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification); 8341 8342 // If the sequence doesn't have enough candidates left, then we're done. 8343 if (RepeatedSequenceLocs.size() < 2) 8344 return std::nullopt; 8345 } 8346 8347 // Properties about candidate MBBs that hold for all of them. 8348 unsigned FlagsSetInAll = 0xF; 8349 8350 // Compute liveness information for each candidate, and set FlagsSetInAll. 
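// FlagsSetInAll is AND-ed with each candidate's flags below, so a bit
// survives only if it is set for every candidate. Illustrative sketch using
// the MachineOutlinerMBBFlags values defined above:
//   candidate 0: HasCalls | UnsafeRegsDead  (0x4 | 0x8)
//   candidate 1: HasCalls                   (0x4)
//   => FlagsSetInAll == HasCalls            (0x4)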
8351 for (outliner::Candidate &C : RepeatedSequenceLocs) 8352 FlagsSetInAll &= C.Flags; 8353 8354 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode(); 8355 8356 // Helper lambda which sets call information for every candidate. 8357 auto SetCandidateCallInfo = 8358 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) { 8359 for (outliner::Candidate &C : RepeatedSequenceLocs) 8360 C.setCallInfo(CallID, NumBytesForCall); 8361 }; 8362 8363 unsigned FrameID = MachineOutlinerDefault; 8364 NumBytesToCreateFrame += 4; 8365 8366 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) { 8367 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement(); 8368 }); 8369 8370 // We check to see if CFI Instructions are present, and if they are 8371 // we find the number of CFI Instructions in the candidates. 8372 unsigned CFICount = 0; 8373 for (auto &I : make_range(RepeatedSequenceLocs[0].front(), 8374 std::next(RepeatedSequenceLocs[0].back()))) { 8375 if (I.isCFIInstruction()) 8376 CFICount++; 8377 } 8378 8379 // We compare the number of found CFI Instructions to the number of CFI 8380 // instructions in the parent function for each candidate. We must check this 8381 // since if we outline one of the CFI instructions in a function, we have to 8382 // outline them all for correctness. If we do not, the address offsets will be 8383 // incorrect between the two sections of the program. 8384 for (outliner::Candidate &C : RepeatedSequenceLocs) { 8385 std::vector<MCCFIInstruction> CFIInstructions = 8386 C.getMF()->getFrameInstructions(); 8387 8388 if (CFICount > 0 && CFICount != CFIInstructions.size()) 8389 return std::nullopt; 8390 } 8391 8392 // Returns true if an instructions is safe to fix up, false otherwise. 8393 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) { 8394 if (MI.isCall()) 8395 return true; 8396 8397 if (!MI.modifiesRegister(AArch64::SP, &TRI) && 8398 !MI.readsRegister(AArch64::SP, &TRI)) 8399 return true; 8400 8401 // Any modification of SP will break our code to save/restore LR. 8402 // FIXME: We could handle some instructions which add a constant 8403 // offset to SP, with a bit more work. 8404 if (MI.modifiesRegister(AArch64::SP, &TRI)) 8405 return false; 8406 8407 // At this point, we have a stack instruction that we might need to 8408 // fix up. We'll handle it if it's a load or store. 8409 if (MI.mayLoadOrStore()) { 8410 const MachineOperand *Base; // Filled with the base operand of MI. 8411 int64_t Offset; // Filled with the offset of MI. 8412 bool OffsetIsScalable; 8413 8414 // Does it allow us to offset the base operand and is the base the 8415 // register SP? 8416 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) || 8417 !Base->isReg() || Base->getReg() != AArch64::SP) 8418 return false; 8419 8420 // Fixe-up code below assumes bytes. 8421 if (OffsetIsScalable) 8422 return false; 8423 8424 // Find the minimum/maximum offset for this instruction and check 8425 // if fixing it up would be in range. 8426 int64_t MinOffset, 8427 MaxOffset; // Unscaled offsets for the instruction. 8428 // The scale to multiply the offsets by. 8429 TypeSize Scale(0U, false), DummyWidth(0U, false); 8430 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset); 8431 8432 Offset += 16; // Update the offset to what it would be if we outlined. 
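      // Worked example (illustrative): an LDRXui has Scale == 8 and an
      // unscaled immediate range of [0, 4095], so after adding the extra 16
      // bytes the offset must still lie within [0, 4095 * 8] to be encodable.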
8433 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() || 8434 Offset > MaxOffset * (int64_t)Scale.getFixedValue()) 8435 return false; 8436 8437 // It's in range, so we can outline it. 8438 return true; 8439 } 8440 8441 // FIXME: Add handling for instructions like "add x0, sp, #8". 8442 8443 // We can't fix it up, so don't outline it. 8444 return false; 8445 }; 8446 8447 // True if it's possible to fix up each stack instruction in this sequence. 8448 // Important for frames/call variants that modify the stack. 8449 bool AllStackInstrsSafe = std::all_of( 8450 FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup); 8451 8452 // If the last instruction in any candidate is a terminator, then we should 8453 // tail call all of the candidates. 8454 if (RepeatedSequenceLocs[0].back()->isTerminator()) { 8455 FrameID = MachineOutlinerTailCall; 8456 NumBytesToCreateFrame = 0; 8457 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue; 8458 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall); 8459 } 8460 8461 else if (LastInstrOpcode == AArch64::BL || 8462 ((LastInstrOpcode == AArch64::BLR || 8463 LastInstrOpcode == AArch64::BLRNoIP) && 8464 !HasBTI)) { 8465 // FIXME: Do we need to check if the code after this uses the value of LR? 8466 FrameID = MachineOutlinerThunk; 8467 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue; 8468 SetCandidateCallInfo(MachineOutlinerThunk, 4); 8469 } 8470 8471 else { 8472 // We need to decide how to emit calls + frames. We can always emit the same 8473 // frame if we don't need to save to the stack. If we have to save to the 8474 // stack, then we need a different frame. 8475 unsigned NumBytesNoStackCalls = 0; 8476 std::vector<outliner::Candidate> CandidatesWithoutStackFixups; 8477 8478 // Check if we have to save LR. 8479 for (outliner::Candidate &C : RepeatedSequenceLocs) { 8480 bool LRAvailable = 8481 (C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere) 8482 ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) 8483 : true; 8484 // If we have a noreturn caller, then we're going to be conservative and 8485 // say that we have to save LR. If we don't have a ret at the end of the 8486 // block, then we can't reason about liveness accurately. 8487 // 8488 // FIXME: We can probably do better than always disabling this in 8489 // noreturn functions by fixing up the liveness info. 8490 bool IsNoReturn = 8491 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn); 8492 8493 // Is LR available? If so, we don't need a save. 8494 if (LRAvailable && !IsNoReturn) { 8495 NumBytesNoStackCalls += 4; 8496 C.setCallInfo(MachineOutlinerNoLRSave, 4); 8497 CandidatesWithoutStackFixups.push_back(C); 8498 } 8499 8500 // Is an unused register available? If so, we won't modify the stack, so 8501 // we can outline with the same frame type as those that don't save LR. 8502 else if (findRegisterToSaveLRTo(C)) { 8503 NumBytesNoStackCalls += 12; 8504 C.setCallInfo(MachineOutlinerRegSave, 12); 8505 CandidatesWithoutStackFixups.push_back(C); 8506 } 8507 8508 // Is SP used in the sequence at all? If not, we don't have to modify 8509 // the stack, so we are guaranteed to get the same frame. 8510 else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) { 8511 NumBytesNoStackCalls += 12; 8512 C.setCallInfo(MachineOutlinerDefault, 12); 8513 CandidatesWithoutStackFixups.push_back(C); 8514 } 8515 8516 // If we outline this, we need to modify the stack. Pretend we don't 8517 // outline this by saving all of its bytes. 
8518 else { 8519 NumBytesNoStackCalls += SequenceSize; 8520 } 8521 } 8522 8523 // If there are no places where we have to save LR, then note that we 8524 // don't have to update the stack. Otherwise, give every candidate the 8525 // default call type, as long as it's safe to do so. 8526 if (!AllStackInstrsSafe || 8527 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) { 8528 RepeatedSequenceLocs = CandidatesWithoutStackFixups; 8529 FrameID = MachineOutlinerNoLRSave; 8530 } else { 8531 SetCandidateCallInfo(MachineOutlinerDefault, 12); 8532 8533 // Bugzilla ID: 46767 8534 // TODO: Check if fixing up the stack more than once is safe so we can 8535 // outline these. 8536 // 8537 // An outline resulting in a caller that requires stack fixups at the 8538 // callsite to a callee that also requires stack fixups can happen when 8539 // there are no available registers at the candidate callsite for a 8540 // candidate that itself also has calls. 8541 // 8542 // In other words if function_containing_sequence in the following pseudo 8543 // assembly requires that we save LR at the point of the call, but there 8544 // are no available registers: in this case we save using SP and as a 8545 // result the SP offsets requires stack fixups by multiples of 16. 8546 // 8547 // function_containing_sequence: 8548 // ... 8549 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N 8550 // call OUTLINED_FUNCTION_N 8551 // restore LR from SP 8552 // ... 8553 // 8554 // OUTLINED_FUNCTION_N: 8555 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N 8556 // ... 8557 // bl foo 8558 // restore LR from SP 8559 // ret 8560 // 8561 // Because the code to handle more than one stack fixup does not 8562 // currently have the proper checks for legality, these cases will assert 8563 // in the AArch64 MachineOutliner. This is because the code to do this 8564 // needs more hardening, testing, better checks that generated code is 8565 // legal, etc and because it is only verified to handle a single pass of 8566 // stack fixup. 8567 // 8568 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch 8569 // these cases until they are known to be handled. Bugzilla 46767 is 8570 // referenced in comments at the assert site. 8571 // 8572 // To avoid asserting (or generating non-legal code on noassert builds) 8573 // we remove all candidates which would need more than one stack fixup by 8574 // pruning the cases where the candidate has calls while also having no 8575 // available LR and having no available general purpose registers to copy 8576 // LR to (ie one extra stack save/restore). 8577 // 8578 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { 8579 erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) { 8580 return (std::any_of( 8581 C.front(), std::next(C.back()), 8582 [](const MachineInstr &MI) { return MI.isCall(); })) && 8583 (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) || 8584 !findRegisterToSaveLRTo(C)); 8585 }); 8586 } 8587 } 8588 8589 // If we dropped all of the candidates, bail out here. 8590 if (RepeatedSequenceLocs.size() < 2) { 8591 RepeatedSequenceLocs.clear(); 8592 return std::nullopt; 8593 } 8594 } 8595 8596 // Does every candidate's MBB contain a call? If so, then we might have a call 8597 // in the range. 8598 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { 8599 // Check if the range contains a call. These require a save + restore of the 8600 // link register. 
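    // Illustrative cost: saving and restoring LR around the outlined body is
    // a single STRXpre/LDRXpost pair, which is why NumBytesToCreateFrame
    // grows by 8 below when such a save is needed.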
8601 bool ModStackToSaveLR = false; 8602 if (std::any_of(FirstCand.front(), FirstCand.back(), 8603 [](const MachineInstr &MI) { return MI.isCall(); })) 8604 ModStackToSaveLR = true; 8605 8606 // Handle the last instruction separately. If this is a tail call, then the 8607 // last instruction is a call. We don't want to save + restore in this case. 8608 // However, it could be possible that the last instruction is a call without 8609 // it being valid to tail call this sequence. We should consider this as 8610 // well. 8611 else if (FrameID != MachineOutlinerThunk && 8612 FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall()) 8613 ModStackToSaveLR = true; 8614 8615 if (ModStackToSaveLR) { 8616 // We can't fix up the stack. Bail out. 8617 if (!AllStackInstrsSafe) { 8618 RepeatedSequenceLocs.clear(); 8619 return std::nullopt; 8620 } 8621 8622 // Save + restore LR. 8623 NumBytesToCreateFrame += 8; 8624 } 8625 } 8626 8627 // If we have CFI instructions, we can only outline if the outlined section 8628 // can be a tail call 8629 if (FrameID != MachineOutlinerTailCall && CFICount > 0) 8630 return std::nullopt; 8631 8632 return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, 8633 NumBytesToCreateFrame, FrameID); 8634 } 8635 8636 void AArch64InstrInfo::mergeOutliningCandidateAttributes( 8637 Function &F, std::vector<outliner::Candidate> &Candidates) const { 8638 // If a bunch of candidates reach this point they must agree on their return 8639 // address signing. It is therefore enough to just consider the signing 8640 // behaviour of one of them 8641 const auto &CFn = Candidates.front().getMF()->getFunction(); 8642 8643 // Since all candidates belong to the same module, just copy the 8644 // function-level attributes of an arbitrary function. 8645 if (CFn.hasFnAttribute("sign-return-address")) 8646 F.addFnAttr(CFn.getFnAttribute("sign-return-address")); 8647 if (CFn.hasFnAttribute("sign-return-address-key")) 8648 F.addFnAttr(CFn.getFnAttribute("sign-return-address-key")); 8649 8650 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates); 8651 } 8652 8653 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom( 8654 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const { 8655 const Function &F = MF.getFunction(); 8656 8657 // Can F be deduplicated by the linker? If it can, don't outline from it. 8658 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage()) 8659 return false; 8660 8661 // Don't outline from functions with section markings; the program could 8662 // expect that all the code is in the named section. 8663 // FIXME: Allow outlining from multiple functions with the same section 8664 // marking. 8665 if (F.hasSection()) 8666 return false; 8667 8668 // Outlining from functions with redzones is unsafe since the outliner may 8669 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't 8670 // outline from it. 8671 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); 8672 if (!AFI || AFI->hasRedZone().value_or(true)) 8673 return false; 8674 8675 // FIXME: Teach the outliner to generate/handle Windows unwind info. 8676 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) 8677 return false; 8678 8679 // It's safe to outline from MF. 
8680 return true; 8681 } 8682 8683 SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>> 8684 AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB, 8685 unsigned &Flags) const { 8686 assert(MBB.getParent()->getRegInfo().tracksLiveness() && 8687 "Must track liveness!"); 8688 SmallVector< 8689 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>> 8690 Ranges; 8691 // According to the AArch64 Procedure Call Standard, the following are 8692 // undefined on entry/exit from a function call: 8693 // 8694 // * Registers x16, x17, (and thus w16, w17) 8695 // * Condition codes (and thus the NZCV register) 8696 // 8697 // If any of these registers are used inside or live across an outlined 8698 // function, then they may be modified later, either by the compiler or 8699 // some other tool (like the linker). 8700 // 8701 // To avoid outlining in these situations, partition each block into ranges 8702 // where these registers are dead. We will only outline from those ranges. 8703 LiveRegUnits LRU(getRegisterInfo()); 8704 auto AreAllUnsafeRegsDead = [&LRU]() { 8705 return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) && 8706 LRU.available(AArch64::NZCV); 8707 }; 8708 8709 // We need to know if LR is live across an outlining boundary later on in 8710 // order to decide how we'll create the outlined call, frame, etc. 8711 // 8712 // It's pretty expensive to check this for *every candidate* within a block. 8713 // That's some potentially n^2 behaviour, since in the worst case, we'd need 8714 // to compute liveness from the end of the block for O(n) candidates within 8715 // the block. 8716 // 8717 // So, to improve the average case, let's keep track of liveness from the end 8718 // of the block to the beginning of *every outlinable range*. If we know that 8719 // LR is available in every range we could outline from, then we know that 8720 // we don't need to check liveness for any candidate within that range. 8721 bool LRAvailableEverywhere = true; 8722 // Compute liveness bottom-up. 8723 LRU.addLiveOuts(MBB); 8724 // Update flags that require info about the entire MBB. 8725 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) { 8726 if (MI.isCall() && !MI.isTerminator()) 8727 Flags |= MachineOutlinerMBBFlags::HasCalls; 8728 }; 8729 // Range: [RangeBegin, RangeEnd) 8730 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd; 8731 unsigned RangeLen; 8732 auto CreateNewRangeStartingAt = 8733 [&RangeBegin, &RangeEnd, 8734 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) { 8735 RangeBegin = NewBegin; 8736 RangeEnd = std::next(RangeBegin); 8737 RangeLen = 0; 8738 }; 8739 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() { 8740 // At least one unsafe register is not dead. We do not want to outline at 8741 // this point. If it is long enough to outline from, save the range 8742 // [RangeBegin, RangeEnd). 8743 if (RangeLen > 1) 8744 Ranges.push_back(std::make_pair(RangeBegin, RangeEnd)); 8745 }; 8746 // Find the first point where all unsafe registers are dead. 8747 // FIND: <safe instr> <-- end of first potential range 8748 // SKIP: <unsafe def> 8749 // SKIP: ... everything between ... 8750 // SKIP: <unsafe use> 8751 auto FirstPossibleEndPt = MBB.instr_rbegin(); 8752 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) { 8753 LRU.stepBackward(*FirstPossibleEndPt); 8754 // Update flags that impact how we outline across the entire block, 8755 // regardless of safety. 
8756 UpdateWholeMBBFlags(*FirstPossibleEndPt); 8757 if (AreAllUnsafeRegsDead()) 8758 break; 8759 } 8760 // If we exhausted the entire block, we have no safe ranges to outline. 8761 if (FirstPossibleEndPt == MBB.instr_rend()) 8762 return Ranges; 8763 // Current range. 8764 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator()); 8765 // StartPt points to the first place where all unsafe registers 8766 // are dead (if there is any such point). Begin partitioning the MBB into 8767 // ranges. 8768 for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) { 8769 LRU.stepBackward(MI); 8770 UpdateWholeMBBFlags(MI); 8771 if (!AreAllUnsafeRegsDead()) { 8772 SaveRangeIfNonEmpty(); 8773 CreateNewRangeStartingAt(MI.getIterator()); 8774 continue; 8775 } 8776 LRAvailableEverywhere &= LRU.available(AArch64::LR); 8777 RangeBegin = MI.getIterator(); 8778 ++RangeLen; 8779 } 8780 // Above loop misses the last (or only) range. If we are still safe, then 8781 // let's save the range. 8782 if (AreAllUnsafeRegsDead()) 8783 SaveRangeIfNonEmpty(); 8784 if (Ranges.empty()) 8785 return Ranges; 8786 // We found the ranges bottom-up. Mapping expects the top-down. Reverse 8787 // the order. 8788 std::reverse(Ranges.begin(), Ranges.end()); 8789 // If there is at least one outlinable range where LR is unavailable 8790 // somewhere, remember that. 8791 if (!LRAvailableEverywhere) 8792 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere; 8793 return Ranges; 8794 } 8795 8796 outliner::InstrType 8797 AArch64InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT, 8798 unsigned Flags) const { 8799 MachineInstr &MI = *MIT; 8800 MachineBasicBlock *MBB = MI.getParent(); 8801 MachineFunction *MF = MBB->getParent(); 8802 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); 8803 8804 // Don't outline anything used for return address signing. The outlined 8805 // function will get signed later if needed 8806 switch (MI.getOpcode()) { 8807 case AArch64::PACM: 8808 case AArch64::PACIASP: 8809 case AArch64::PACIBSP: 8810 case AArch64::PACIASPPC: 8811 case AArch64::PACIBSPPC: 8812 case AArch64::AUTIASP: 8813 case AArch64::AUTIBSP: 8814 case AArch64::AUTIASPPCi: 8815 case AArch64::AUTIASPPCr: 8816 case AArch64::AUTIBSPPCi: 8817 case AArch64::AUTIBSPPCr: 8818 case AArch64::RETAA: 8819 case AArch64::RETAB: 8820 case AArch64::RETAASPPCi: 8821 case AArch64::RETAASPPCr: 8822 case AArch64::RETABSPPCi: 8823 case AArch64::RETABSPPCr: 8824 case AArch64::EMITBKEY: 8825 case AArch64::PAUTH_PROLOGUE: 8826 case AArch64::PAUTH_EPILOGUE: 8827 return outliner::InstrType::Illegal; 8828 } 8829 8830 // Don't outline LOHs. 8831 if (FuncInfo->getLOHRelated().count(&MI)) 8832 return outliner::InstrType::Illegal; 8833 8834 // We can only outline these if we will tail call the outlined function, or 8835 // fix up the CFI offsets. Currently, CFI instructions are outlined only if 8836 // in a tail call. 8837 // 8838 // FIXME: If the proper fixups for the offset are implemented, this should be 8839 // possible. 8840 if (MI.isCFIInstruction()) 8841 return outliner::InstrType::Legal; 8842 8843 // Is this a terminator for a basic block? 8844 if (MI.isTerminator()) 8845 // TargetInstrInfo::getOutliningType has already filtered out anything 8846 // that would break this, so we can allow it here. 8847 return outliner::InstrType::Legal; 8848 8849 // Make sure none of the operands are un-outlinable. 
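  // For example (illustrative): "mov x0, x30" reads LR explicitly; outlining
  // it would change the value observed in LR at that point, so instructions
  // with explicit LR/W30 operands are rejected below.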
8850 for (const MachineOperand &MOP : MI.operands()) { 8851 // A check preventing CFI indices was here before, but only CFI 8852 // instructions should have those. 8853 assert(!MOP.isCFIIndex()); 8854 8855 // If it uses LR or W30 explicitly, then don't touch it. 8856 if (MOP.isReg() && !MOP.isImplicit() && 8857 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30)) 8858 return outliner::InstrType::Illegal; 8859 } 8860 8861 // Special cases for instructions that can always be outlined, but will fail 8862 // the later tests, e.g. ADRPs, which are PC-relative, use LR, but can always 8863 // be outlined because they don't require a *specific* value to be in LR. 8864 if (MI.getOpcode() == AArch64::ADRP) 8865 return outliner::InstrType::Legal; 8866 8867 // If MI is a call we might be able to outline it. We don't want to outline 8868 // any calls that rely on the position of items on the stack. When we outline 8869 // something containing a call, we have to emit a save and restore of LR in 8870 // the outlined function. Currently, this always happens by saving LR to the 8871 // stack. Thus, if we outline, say, half the parameters for a function call 8872 // plus the call, then we'll break the callee's expectations for the layout 8873 // of the stack. 8874 // 8875 // FIXME: Allow calls to functions which construct a stack frame, as long 8876 // as they don't access arguments on the stack. 8877 // FIXME: Figure out some way to analyze functions defined in other modules. 8878 // We should be able to compute the memory usage based on the IR calling 8879 // convention, even if we can't see the definition. 8880 if (MI.isCall()) { 8881 // Get the function associated with the call. Look at each operand and find 8882 // the one that represents the callee and get its name. 8883 const Function *Callee = nullptr; 8884 for (const MachineOperand &MOP : MI.operands()) { 8885 if (MOP.isGlobal()) { 8886 Callee = dyn_cast<Function>(MOP.getGlobal()); 8887 break; 8888 } 8889 } 8890 8891 // Never outline calls to mcount. There isn't any rule that would require 8892 // this, but the Linux kernel's "ftrace" feature depends on it. 8893 if (Callee && Callee->getName() == "\01_mcount") 8894 return outliner::InstrType::Illegal; 8895 8896 // If we don't know anything about the callee, assume it depends on the 8897 // stack layout of the caller. In that case, it's only legal to outline 8898 // as a tail-call. Explicitly list the call instructions we know about so we 8899 // don't get unexpected results with call pseudo-instructions. 8900 auto UnknownCallOutlineType = outliner::InstrType::Illegal; 8901 if (MI.getOpcode() == AArch64::BLR || 8902 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL) 8903 UnknownCallOutlineType = outliner::InstrType::LegalTerminator; 8904 8905 if (!Callee) 8906 return UnknownCallOutlineType; 8907 8908 // We have a function we have information about. Check if it's something we 8909 // can safely outline. 8910 MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee); 8911 8912 // We don't know what's going on with the callee at all. Don't touch it. 8913 if (!CalleeMF) 8914 return UnknownCallOutlineType; 8915 8916 // Check if we know anything about the callee saves on the function. If we 8917 // don't, then don't touch it, since that implies that we haven't 8918 // computed anything about its stack frame yet.
8919 MachineFrameInfo &MFI = CalleeMF->getFrameInfo(); 8920 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 || 8921 MFI.getNumObjects() > 0) 8922 return UnknownCallOutlineType; 8923 8924 // At this point, we can say that CalleeMF ought to not pass anything on the 8925 // stack. Therefore, we can outline it. 8926 return outliner::InstrType::Legal; 8927 } 8928 8929 // Don't touch the link register or W30. 8930 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) || 8931 MI.modifiesRegister(AArch64::W30, &getRegisterInfo())) 8932 return outliner::InstrType::Illegal; 8933 8934 // Don't outline BTI instructions, because that will prevent the outlining 8935 // site from being indirectly callable. 8936 if (hasBTISemantics(MI)) 8937 return outliner::InstrType::Illegal; 8938 8939 return outliner::InstrType::Legal; 8940 } 8941 8942 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const { 8943 for (MachineInstr &MI : MBB) { 8944 const MachineOperand *Base; 8945 TypeSize Width(0, false); 8946 int64_t Offset; 8947 bool OffsetIsScalable; 8948 8949 // Is this a load or store with an immediate offset with SP as the base? 8950 if (!MI.mayLoadOrStore() || 8951 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width, 8952 &RI) || 8953 (Base->isReg() && Base->getReg() != AArch64::SP)) 8954 continue; 8955 8956 // It is, so we have to fix it up. 8957 TypeSize Scale(0U, false); 8958 int64_t Dummy1, Dummy2; 8959 8960 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI); 8961 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!"); 8962 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2); 8963 assert(Scale != 0 && "Unexpected opcode!"); 8964 assert(!OffsetIsScalable && "Expected offset to be a byte offset"); 8965 8966 // We've pushed the return address to the stack, so add 16 to the offset. 8967 // This is safe, since we already checked if it would overflow when we 8968 // checked if this instruction was legal to outline. 8969 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue(); 8970 StackOffsetOperand.setImm(NewImm); 8971 } 8972 } 8973 8974 static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, 8975 const AArch64InstrInfo *TII, 8976 bool ShouldSignReturnAddr) { 8977 if (!ShouldSignReturnAddr) 8978 return; 8979 8980 BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE)) 8981 .setMIFlag(MachineInstr::FrameSetup); 8982 BuildMI(MBB, MBB.getFirstInstrTerminator(), DebugLoc(), 8983 TII->get(AArch64::PAUTH_EPILOGUE)) 8984 .setMIFlag(MachineInstr::FrameDestroy); 8985 } 8986 8987 void AArch64InstrInfo::buildOutlinedFrame( 8988 MachineBasicBlock &MBB, MachineFunction &MF, 8989 const outliner::OutlinedFunction &OF) const { 8990 8991 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>(); 8992 8993 if (OF.FrameConstructionID == MachineOutlinerTailCall) 8994 FI->setOutliningStyle("Tail Call"); 8995 else if (OF.FrameConstructionID == MachineOutlinerThunk) { 8996 // For thunk outlining, rewrite the last instruction from a call to a 8997 // tail-call. 
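    // Illustrative effect: an outlined body that ends in "bl callee" is
    // rewritten to end in "b callee" (TCRETURNdi), and "blr xN" becomes a
    // register tail call (TCRETURNriALL), so the callee returns straight to
    // the original call site.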
8998 MachineInstr *Call = &*--MBB.instr_end(); 8999 unsigned TailOpcode; 9000 if (Call->getOpcode() == AArch64::BL) { 9001 TailOpcode = AArch64::TCRETURNdi; 9002 } else { 9003 assert(Call->getOpcode() == AArch64::BLR || 9004 Call->getOpcode() == AArch64::BLRNoIP); 9005 TailOpcode = AArch64::TCRETURNriALL; 9006 } 9007 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode)) 9008 .add(Call->getOperand(0)) 9009 .addImm(0); 9010 MBB.insert(MBB.end(), TC); 9011 Call->eraseFromParent(); 9012 9013 FI->setOutliningStyle("Thunk"); 9014 } 9015 9016 bool IsLeafFunction = true; 9017 9018 // Is there a call in the outlined range? 9019 auto IsNonTailCall = [](const MachineInstr &MI) { 9020 return MI.isCall() && !MI.isReturn(); 9021 }; 9022 9023 if (llvm::any_of(MBB.instrs(), IsNonTailCall)) { 9024 // Fix up the instructions in the range, since we're going to modify the 9025 // stack. 9026 9027 // Bugzilla ID: 46767 9028 // TODO: Check if fixing up twice is safe so we can outline these. 9029 assert(OF.FrameConstructionID != MachineOutlinerDefault && 9030 "Can only fix up stack references once"); 9031 fixupPostOutline(MBB); 9032 9033 IsLeafFunction = false; 9034 9035 // LR has to be a live in so that we can save it. 9036 if (!MBB.isLiveIn(AArch64::LR)) 9037 MBB.addLiveIn(AArch64::LR); 9038 9039 MachineBasicBlock::iterator It = MBB.begin(); 9040 MachineBasicBlock::iterator Et = MBB.end(); 9041 9042 if (OF.FrameConstructionID == MachineOutlinerTailCall || 9043 OF.FrameConstructionID == MachineOutlinerThunk) 9044 Et = std::prev(MBB.end()); 9045 9046 // Insert a save before the outlined region 9047 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) 9048 .addReg(AArch64::SP, RegState::Define) 9049 .addReg(AArch64::LR) 9050 .addReg(AArch64::SP) 9051 .addImm(-16); 9052 It = MBB.insert(It, STRXpre); 9053 9054 if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) { 9055 const TargetSubtargetInfo &STI = MF.getSubtarget(); 9056 const MCRegisterInfo *MRI = STI.getRegisterInfo(); 9057 unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true); 9058 9059 // Add a CFI saying the stack was moved 16 B down. 9060 int64_t StackPosEntry = 9061 MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16)); 9062 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) 9063 .addCFIIndex(StackPosEntry) 9064 .setMIFlags(MachineInstr::FrameSetup); 9065 9066 // Add a CFI saying that the LR that we want to find is now 16 B higher 9067 // than before. 9068 int64_t LRPosEntry = MF.addFrameInst( 9069 MCCFIInstruction::createOffset(nullptr, DwarfReg, -16)); 9070 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) 9071 .addCFIIndex(LRPosEntry) 9072 .setMIFlags(MachineInstr::FrameSetup); 9073 } 9074 9075 // Insert a restore before the terminator for the function. 9076 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) 9077 .addReg(AArch64::SP, RegState::Define) 9078 .addReg(AArch64::LR, RegState::Define) 9079 .addReg(AArch64::SP) 9080 .addImm(16); 9081 Et = MBB.insert(Et, LDRXpost); 9082 } 9083 9084 bool ShouldSignReturnAddr = FI->shouldSignReturnAddress(!IsLeafFunction); 9085 9086 // If this is a tail call outlined function, then there's already a return. 9087 if (OF.FrameConstructionID == MachineOutlinerTailCall || 9088 OF.FrameConstructionID == MachineOutlinerThunk) { 9089 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr); 9090 return; 9091 } 9092 9093 // It's not a tail call, so we have to insert the return ourselves. 
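  // Illustrative result: the outlined function ends in a plain "ret" through
  // LR, built below, and PAUTH_PROLOGUE/PAUTH_EPILOGUE pseudos are added
  // around the body afterwards if return-address signing is required.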
9094 9095 // LR has to be a live in so that we can return to it. 9096 if (!MBB.isLiveIn(AArch64::LR)) 9097 MBB.addLiveIn(AArch64::LR); 9098 9099 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET)) 9100 .addReg(AArch64::LR); 9101 MBB.insert(MBB.end(), ret); 9102 9103 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr); 9104 9105 FI->setOutliningStyle("Function"); 9106 9107 // Did we have to modify the stack by saving the link register? 9108 if (OF.FrameConstructionID != MachineOutlinerDefault) 9109 return; 9110 9111 // We modified the stack. 9112 // Walk over the basic block and fix up all the stack accesses. 9113 fixupPostOutline(MBB); 9114 } 9115 9116 MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall( 9117 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, 9118 MachineFunction &MF, outliner::Candidate &C) const { 9119 9120 // Are we tail calling? 9121 if (C.CallConstructionID == MachineOutlinerTailCall) { 9122 // If yes, then we can just branch to the label. 9123 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi)) 9124 .addGlobalAddress(M.getNamedValue(MF.getName())) 9125 .addImm(0)); 9126 return It; 9127 } 9128 9129 // Are we saving the link register? 9130 if (C.CallConstructionID == MachineOutlinerNoLRSave || 9131 C.CallConstructionID == MachineOutlinerThunk) { 9132 // No, so just insert the call. 9133 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL)) 9134 .addGlobalAddress(M.getNamedValue(MF.getName()))); 9135 return It; 9136 } 9137 9138 // We want to return the spot where we inserted the call. 9139 MachineBasicBlock::iterator CallPt; 9140 9141 // Instructions for saving and restoring LR around the call instruction we're 9142 // going to insert. 9143 MachineInstr *Save; 9144 MachineInstr *Restore; 9145 // Can we save to a register? 9146 if (C.CallConstructionID == MachineOutlinerRegSave) { 9147 // FIXME: This logic should be sunk into a target-specific interface so that 9148 // we don't have to recompute the register. 9149 Register Reg = findRegisterToSaveLRTo(C); 9150 assert(Reg && "No callee-saved register available?"); 9151 9152 // LR has to be a live in so that we can save it. 9153 if (!MBB.isLiveIn(AArch64::LR)) 9154 MBB.addLiveIn(AArch64::LR); 9155 9156 // Save and restore LR from Reg. 9157 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg) 9158 .addReg(AArch64::XZR) 9159 .addReg(AArch64::LR) 9160 .addImm(0); 9161 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR) 9162 .addReg(AArch64::XZR) 9163 .addReg(Reg) 9164 .addImm(0); 9165 } else { 9166 // We have the default case. Save and restore from SP. 9167 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) 9168 .addReg(AArch64::SP, RegState::Define) 9169 .addReg(AArch64::LR) 9170 .addReg(AArch64::SP) 9171 .addImm(-16); 9172 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) 9173 .addReg(AArch64::SP, RegState::Define) 9174 .addReg(AArch64::LR, RegState::Define) 9175 .addReg(AArch64::SP) 9176 .addImm(16); 9177 } 9178 9179 It = MBB.insert(It, Save); 9180 It++; 9181 9182 // Insert the call. 
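  // The emitted call site therefore looks like (illustrative):
  //   RegSave:  mov xN, lr ; bl OUTLINED_FUNCTION_* ; mov lr, xN
  //   Default:  str lr, [sp, #-16]! ; bl OUTLINED_FUNCTION_* ; ldr lr, [sp], #16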
9183 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL)) 9184 .addGlobalAddress(M.getNamedValue(MF.getName()))); 9185 CallPt = It; 9186 It++; 9187 9188 It = MBB.insert(It, Restore); 9189 return CallPt; 9190 } 9191 9192 bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault( 9193 MachineFunction &MF) const { 9194 return MF.getFunction().hasMinSize(); 9195 } 9196 9197 void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB, 9198 MachineBasicBlock::iterator Iter, 9199 DebugLoc &DL, 9200 bool AllowSideEffects) const { 9201 const MachineFunction &MF = *MBB.getParent(); 9202 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>(); 9203 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo(); 9204 9205 if (TRI.isGeneralPurposeRegister(MF, Reg)) { 9206 BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0); 9207 } else if (STI.hasSVE()) { 9208 BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg) 9209 .addImm(0) 9210 .addImm(0); 9211 } else { 9212 BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg) 9213 .addImm(0); 9214 } 9215 } 9216 9217 std::optional<DestSourcePair> 9218 AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { 9219 9220 // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg 9221 // and zero immediate operands used as an alias for mov instruction. 9222 if (MI.getOpcode() == AArch64::ORRWrs && 9223 MI.getOperand(1).getReg() == AArch64::WZR && 9224 MI.getOperand(3).getImm() == 0x0 && 9225 // Check that the w->w move is not a zero-extending w->x mov. 9226 (!MI.getOperand(0).getReg().isVirtual() || 9227 MI.getOperand(0).getSubReg() == 0) && 9228 (!MI.getOperand(0).getReg().isPhysical() || 9229 MI.findRegisterDefOperandIdx(MI.getOperand(0).getReg() - AArch64::W0 + 9230 AArch64::X0) == -1)) 9231 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)}; 9232 9233 if (MI.getOpcode() == AArch64::ORRXrs && 9234 MI.getOperand(1).getReg() == AArch64::XZR && 9235 MI.getOperand(3).getImm() == 0x0) 9236 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)}; 9237 9238 return std::nullopt; 9239 } 9240 9241 std::optional<DestSourcePair> 9242 AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const { 9243 if (MI.getOpcode() == AArch64::ORRWrs && 9244 MI.getOperand(1).getReg() == AArch64::WZR && 9245 MI.getOperand(3).getImm() == 0x0) 9246 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)}; 9247 return std::nullopt; 9248 } 9249 9250 std::optional<RegImmPair> 9251 AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const { 9252 int Sign = 1; 9253 int64_t Offset = 0; 9254 9255 // TODO: Handle cases where Reg is a super- or sub-register of the 9256 // destination register. 9257 const MachineOperand &Op0 = MI.getOperand(0); 9258 if (!Op0.isReg() || Reg != Op0.getReg()) 9259 return std::nullopt; 9260 9261 switch (MI.getOpcode()) { 9262 default: 9263 return std::nullopt; 9264 case AArch64::SUBWri: 9265 case AArch64::SUBXri: 9266 case AArch64::SUBSWri: 9267 case AArch64::SUBSXri: 9268 Sign *= -1; 9269 [[fallthrough]]; 9270 case AArch64::ADDSWri: 9271 case AArch64::ADDSXri: 9272 case AArch64::ADDWri: 9273 case AArch64::ADDXri: { 9274 // TODO: Third operand can be global address (usually some string). 
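    // Illustrative examples (not from the original): "add x0, x1, #16" is
    // described as x0 = x1 + 16, and "sub x0, x1, #4, lsl #12" as
    // x0 = x1 - 16384, since the immediate may carry an optional LSL #12.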
9275 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() || 9276 !MI.getOperand(2).isImm()) 9277 return std::nullopt; 9278 int Shift = MI.getOperand(3).getImm(); 9279 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12"); 9280 Offset = Sign * (MI.getOperand(2).getImm() << Shift); 9281 } 9282 } 9283 return RegImmPair{MI.getOperand(1).getReg(), Offset}; 9284 } 9285 9286 /// If the given ORR instruction is a copy, and \p DescribedReg overlaps with 9287 /// the destination register then, if possible, describe the value in terms of 9288 /// the source register. 9289 static std::optional<ParamLoadedValue> 9290 describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg, 9291 const TargetInstrInfo *TII, 9292 const TargetRegisterInfo *TRI) { 9293 auto DestSrc = TII->isCopyLikeInstr(MI); 9294 if (!DestSrc) 9295 return std::nullopt; 9296 9297 Register DestReg = DestSrc->Destination->getReg(); 9298 Register SrcReg = DestSrc->Source->getReg(); 9299 9300 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {}); 9301 9302 // If the described register is the destination, just return the source. 9303 if (DestReg == DescribedReg) 9304 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr); 9305 9306 // ORRWrs zero-extends to 64-bits, so we need to consider such cases. 9307 if (MI.getOpcode() == AArch64::ORRWrs && 9308 TRI->isSuperRegister(DestReg, DescribedReg)) 9309 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr); 9310 9311 // We may need to describe the lower part of a ORRXrs move. 9312 if (MI.getOpcode() == AArch64::ORRXrs && 9313 TRI->isSubRegister(DestReg, DescribedReg)) { 9314 Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32); 9315 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr); 9316 } 9317 9318 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) && 9319 "Unhandled ORR[XW]rs copy case"); 9320 9321 return std::nullopt; 9322 } 9323 9324 bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const { 9325 // Functions cannot be split to different sections on AArch64 if they have 9326 // a red zone. This is because relaxing a cross-section branch may require 9327 // incrementing the stack pointer to spill a register, which would overwrite 9328 // the red zone. 9329 if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true)) 9330 return false; 9331 9332 return TargetInstrInfo::isFunctionSafeToSplit(MF); 9333 } 9334 9335 bool AArch64InstrInfo::isMBBSafeToSplitToCold( 9336 const MachineBasicBlock &MBB) const { 9337 // Asm Goto blocks can contain conditional branches to goto labels, which can 9338 // get moved out of range of the branch instruction. 9339 auto isAsmGoto = [](const MachineInstr &MI) { 9340 return MI.getOpcode() == AArch64::INLINEASM_BR; 9341 }; 9342 if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget()) 9343 return false; 9344 9345 // Because jump tables are label-relative instead of table-relative, they all 9346 // must be in the same section or relocation fixup handling will fail. 
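  // For example (illustrative): a compressed JumpTableDest8 entry encodes a
  // small label-relative delta that cannot reach a target block moved into a
  // separate cold section.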
9347 9348 // Check if MBB is a jump table target 9349 const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo(); 9350 auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) { 9351 return llvm::is_contained(JTE.MBBs, &MBB); 9352 }; 9353 if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB)) 9354 return false; 9355 9356 // Check if MBB contains a jump table lookup 9357 for (const MachineInstr &MI : MBB) { 9358 switch (MI.getOpcode()) { 9359 case TargetOpcode::G_BRJT: 9360 case AArch64::JumpTableDest32: 9361 case AArch64::JumpTableDest16: 9362 case AArch64::JumpTableDest8: 9363 return false; 9364 default: 9365 continue; 9366 } 9367 } 9368 9369 // MBB isn't a special case, so it's safe to be split to the cold section. 9370 return true; 9371 } 9372 9373 std::optional<ParamLoadedValue> 9374 AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI, 9375 Register Reg) const { 9376 const MachineFunction *MF = MI.getMF(); 9377 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); 9378 switch (MI.getOpcode()) { 9379 case AArch64::MOVZWi: 9380 case AArch64::MOVZXi: { 9381 // MOVZWi may be used for producing zero-extended 32-bit immediates in 9382 // 64-bit parameters, so we need to consider super-registers. 9383 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg)) 9384 return std::nullopt; 9385 9386 if (!MI.getOperand(1).isImm()) 9387 return std::nullopt; 9388 int64_t Immediate = MI.getOperand(1).getImm(); 9389 int Shift = MI.getOperand(2).getImm(); 9390 return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift), 9391 nullptr); 9392 } 9393 case AArch64::ORRWrs: 9394 case AArch64::ORRXrs: 9395 return describeORRLoadedValue(MI, Reg, this, TRI); 9396 } 9397 9398 return TargetInstrInfo::describeLoadedValue(MI, Reg); 9399 } 9400 9401 bool AArch64InstrInfo::isExtendLikelyToBeFolded( 9402 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const { 9403 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT || 9404 ExtMI.getOpcode() == TargetOpcode::G_ZEXT || 9405 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT); 9406 9407 // Anyexts are nops. 9408 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT) 9409 return true; 9410 9411 Register DefReg = ExtMI.getOperand(0).getReg(); 9412 if (!MRI.hasOneNonDBGUse(DefReg)) 9413 return false; 9414 9415 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an 9416 // addressing mode. 9417 auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg); 9418 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD; 9419 } 9420 9421 uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const { 9422 return get(Opc).TSFlags & AArch64::ElementSizeMask; 9423 } 9424 9425 bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const { 9426 return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike; 9427 } 9428 9429 bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const { 9430 return get(Opc).TSFlags & AArch64::InstrFlagIsWhile; 9431 } 9432 9433 unsigned int 9434 AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const { 9435 return OptLevel >= CodeGenOptLevel::Aggressive ? 
6 : 2; 9436 } 9437 9438 bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset, 9439 unsigned Scale) const { 9440 if (Offset && Scale) 9441 return false; 9442 9443 // Check Reg + Imm 9444 if (!Scale) { 9445 // 9-bit signed offset 9446 if (isInt<9>(Offset)) 9447 return true; 9448 9449 // 12-bit unsigned offset 9450 unsigned Shift = Log2_64(NumBytes); 9451 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 && 9452 // Must be a multiple of NumBytes (NumBytes is a power of 2) 9453 (Offset >> Shift) << Shift == Offset) 9454 return true; 9455 return false; 9456 } 9457 9458 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 9459 return Scale == 1 || (Scale > 0 && Scale == NumBytes); 9460 } 9461 9462 unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) { 9463 if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr()) 9464 return AArch64::BLRNoIP; 9465 else 9466 return AArch64::BLR; 9467 } 9468 9469 bool AArch64InstrInfo::isReallyTriviallyReMaterializable( 9470 const MachineInstr &MI) const { 9471 const MachineFunction &MF = *MI.getMF(); 9472 const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>(); 9473 9474 // If the function contains changes to streaming mode, then there 9475 // is a danger that rematerialised instructions end up between 9476 // instruction sequences (e.g. call sequences, or prolog/epilogue) 9477 // where the streaming-SVE mode is temporarily changed. 9478 if (AFI.hasStreamingModeChanges()) { 9479 // Avoid rematerializing rematerializable instructions that use/define 9480 // scalable values, such as 'pfalse' or 'ptrue', which result in different 9481 // results when the runtime vector length is different. 9482 const MachineRegisterInfo &MRI = MF.getRegInfo(); 9483 const MachineFrameInfo &MFI = MF.getFrameInfo(); 9484 if (any_of(MI.operands(), [&MRI, &MFI](const MachineOperand &MO) { 9485 if (MO.isFI() && 9486 MFI.getStackID(MO.getIndex()) == TargetStackID::ScalableVector) 9487 return true; 9488 if (!MO.isReg()) 9489 return false; 9490 9491 if (MO.getReg().isVirtual()) { 9492 const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg()); 9493 return AArch64::ZPRRegClass.hasSubClassEq(RC) || 9494 AArch64::PPRRegClass.hasSubClassEq(RC); 9495 } 9496 return AArch64::ZPRRegClass.contains(MO.getReg()) || 9497 AArch64::PPRRegClass.contains(MO.getReg()); 9498 })) 9499 return false; 9500 9501 // Avoid rematerializing instructions that return a value that is 9502 // different depending on vector length, even when it is not returned 9503 // in a scalable vector/predicate register. 
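    // For example (illustrative): CNTD_XPiI materialises the number of
    // 64-bit elements per vector, a value that differs between vector
    // lengths, so it must not be rematerialised across a streaming-mode
    // change where the effective vector length may differ.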
9504 switch (MI.getOpcode()) { 9505 default: 9506 break; 9507 case AArch64::RDVLI_XI: 9508 case AArch64::ADDVL_XXI: 9509 case AArch64::ADDPL_XXI: 9510 case AArch64::CNTB_XPiI: 9511 case AArch64::CNTH_XPiI: 9512 case AArch64::CNTW_XPiI: 9513 case AArch64::CNTD_XPiI: 9514 return false; 9515 } 9516 } 9517 9518 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI); 9519 } 9520 9521 MachineBasicBlock::iterator 9522 AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI, 9523 Register TargetReg, bool FrameSetup) const { 9524 assert(TargetReg != AArch64::SP && "New top of stack cannot aleady be in SP"); 9525 9526 MachineBasicBlock &MBB = *MBBI->getParent(); 9527 MachineFunction &MF = *MBB.getParent(); 9528 const AArch64InstrInfo *TII = 9529 MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); 9530 int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize(); 9531 DebugLoc DL = MBB.findDebugLoc(MBBI); 9532 9533 MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator()); 9534 MachineBasicBlock *LoopTestMBB = 9535 MF.CreateMachineBasicBlock(MBB.getBasicBlock()); 9536 MF.insert(MBBInsertPoint, LoopTestMBB); 9537 MachineBasicBlock *LoopBodyMBB = 9538 MF.CreateMachineBasicBlock(MBB.getBasicBlock()); 9539 MF.insert(MBBInsertPoint, LoopBodyMBB); 9540 MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); 9541 MF.insert(MBBInsertPoint, ExitMBB); 9542 MachineInstr::MIFlag Flags = 9543 FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags; 9544 9545 // LoopTest: 9546 // SUB SP, SP, #ProbeSize 9547 emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP, 9548 AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags); 9549 9550 // CMP SP, TargetReg 9551 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64), 9552 AArch64::XZR) 9553 .addReg(AArch64::SP) 9554 .addReg(TargetReg) 9555 .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0)) 9556 .setMIFlags(Flags); 9557 9558 // B.<Cond> LoopExit 9559 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc)) 9560 .addImm(AArch64CC::LE) 9561 .addMBB(ExitMBB) 9562 .setMIFlags(Flags); 9563 9564 // STR XZR, [SP] 9565 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui)) 9566 .addReg(AArch64::XZR) 9567 .addReg(AArch64::SP) 9568 .addImm(0) 9569 .setMIFlags(Flags); 9570 9571 // B loop 9572 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B)) 9573 .addMBB(LoopTestMBB) 9574 .setMIFlags(Flags); 9575 9576 // LoopExit: 9577 // MOV SP, TargetReg 9578 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP) 9579 .addReg(TargetReg) 9580 .addImm(0) 9581 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) 9582 .setMIFlags(Flags); 9583 9584 // LDR XZR, [SP] 9585 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui)) 9586 .addReg(AArch64::XZR, RegState::Define) 9587 .addReg(AArch64::SP) 9588 .addImm(0) 9589 .setMIFlags(Flags); 9590 9591 ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end()); 9592 ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB); 9593 9594 LoopTestMBB->addSuccessor(ExitMBB); 9595 LoopTestMBB->addSuccessor(LoopBodyMBB); 9596 LoopBodyMBB->addSuccessor(LoopTestMBB); 9597 MBB.addSuccessor(LoopTestMBB); 9598 9599 // Update liveins. 
9600 if (MF.getRegInfo().reservedRegsFrozen()) { 9601 recomputeLiveIns(*LoopTestMBB); 9602 recomputeLiveIns(*LoopBodyMBB); 9603 recomputeLiveIns(*ExitMBB); 9604 } 9605 9606 return ExitMBB->begin(); 9607 } 9608 9609 #define GET_INSTRINFO_HELPERS 9610 #define GET_INSTRMAP_INFO 9611 #include "AArch64GenInstrInfo.inc" 9612