1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains the AArch64 implementation of the TargetInstrInfo class. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "AArch64InstrInfo.h" 14 #include "AArch64ExpandImm.h" 15 #include "AArch64FrameLowering.h" 16 #include "AArch64MachineFunctionInfo.h" 17 #include "AArch64PointerAuth.h" 18 #include "AArch64Subtarget.h" 19 #include "MCTargetDesc/AArch64AddressingModes.h" 20 #include "Utils/AArch64BaseInfo.h" 21 #include "llvm/ADT/ArrayRef.h" 22 #include "llvm/ADT/STLExtras.h" 23 #include "llvm/ADT/SmallVector.h" 24 #include "llvm/CodeGen/LivePhysRegs.h" 25 #include "llvm/CodeGen/MachineBasicBlock.h" 26 #include "llvm/CodeGen/MachineCombinerPattern.h" 27 #include "llvm/CodeGen/MachineFrameInfo.h" 28 #include "llvm/CodeGen/MachineFunction.h" 29 #include "llvm/CodeGen/MachineInstr.h" 30 #include "llvm/CodeGen/MachineInstrBuilder.h" 31 #include "llvm/CodeGen/MachineMemOperand.h" 32 #include "llvm/CodeGen/MachineModuleInfo.h" 33 #include "llvm/CodeGen/MachineOperand.h" 34 #include "llvm/CodeGen/MachineRegisterInfo.h" 35 #include "llvm/CodeGen/RegisterScavenging.h" 36 #include "llvm/CodeGen/StackMaps.h" 37 #include "llvm/CodeGen/TargetRegisterInfo.h" 38 #include "llvm/CodeGen/TargetSubtargetInfo.h" 39 #include "llvm/IR/DebugInfoMetadata.h" 40 #include "llvm/IR/DebugLoc.h" 41 #include "llvm/IR/GlobalValue.h" 42 #include "llvm/MC/MCAsmInfo.h" 43 #include "llvm/MC/MCInst.h" 44 #include "llvm/MC/MCInstBuilder.h" 45 #include "llvm/MC/MCInstrDesc.h" 46 #include "llvm/Support/Casting.h" 47 #include "llvm/Support/CodeGen.h" 48 #include "llvm/Support/CommandLine.h" 49 #include "llvm/Support/ErrorHandling.h" 50 #include "llvm/Support/LEB128.h" 51 #include "llvm/Support/MathExtras.h" 52 #include "llvm/Target/TargetMachine.h" 53 #include "llvm/Target/TargetOptions.h" 54 #include <cassert> 55 #include <cstdint> 56 #include <iterator> 57 #include <utility> 58 59 using namespace llvm; 60 61 #define GET_INSTRINFO_CTOR_DTOR 62 #include "AArch64GenInstrInfo.inc" 63 64 static cl::opt<unsigned> TBZDisplacementBits( 65 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), 66 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); 67 68 static cl::opt<unsigned> CBZDisplacementBits( 69 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), 70 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)")); 71 72 static cl::opt<unsigned> 73 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), 74 cl::desc("Restrict range of Bcc instructions (DEBUG)")); 75 76 static cl::opt<unsigned> 77 BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26), 78 cl::desc("Restrict range of B instructions (DEBUG)")); 79 80 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) 81 : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP, 82 AArch64::CATCHRET), 83 RI(STI.getTargetTriple()), Subtarget(STI) {} 84 85 /// GetInstSize - Return the number of bytes of code the specified 86 /// instruction may be. This returns the maximum number of bytes. 
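/// As an illustration of the cases handled below: an ordinary encoded
/// instruction such as ADDXri reports the 4 bytes from its MCInstrDesc,
/// meta-instructions (DBG_VALUE, IMPLICIT_DEF, ...) report 0, and patchable
/// pseudos (STACKMAP, PATCHPOINT, STATEPOINT) report the patch-byte count
/// carried in their operands.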
87 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { 88 const MachineBasicBlock &MBB = *MI.getParent(); 89 const MachineFunction *MF = MBB.getParent(); 90 const Function &F = MF->getFunction(); 91 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); 92 93 { 94 auto Op = MI.getOpcode(); 95 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR) 96 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); 97 } 98 99 // Meta-instructions emit no code. 100 if (MI.isMetaInstruction()) 101 return 0; 102 103 // FIXME: We currently only handle pseudoinstructions that don't get expanded 104 // before the assembly printer. 105 unsigned NumBytes = 0; 106 const MCInstrDesc &Desc = MI.getDesc(); 107 108 // Size should be preferably set in 109 // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case). 110 // Specific cases handle instructions of variable sizes 111 switch (Desc.getOpcode()) { 112 default: 113 if (Desc.getSize()) 114 return Desc.getSize(); 115 116 // Anything not explicitly designated otherwise (i.e. pseudo-instructions 117 // with fixed constant size but not specified in .td file) is a normal 118 // 4-byte insn. 119 NumBytes = 4; 120 break; 121 case TargetOpcode::STACKMAP: 122 // The upper bound for a stackmap intrinsic is the full length of its shadow 123 NumBytes = StackMapOpers(&MI).getNumPatchBytes(); 124 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 125 break; 126 case TargetOpcode::PATCHPOINT: 127 // The size of the patchpoint intrinsic is the number of bytes requested 128 NumBytes = PatchPointOpers(&MI).getNumPatchBytes(); 129 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 130 break; 131 case TargetOpcode::STATEPOINT: 132 NumBytes = StatepointOpers(&MI).getNumPatchBytes(); 133 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 134 // No patch bytes means a normal call inst is emitted 135 if (NumBytes == 0) 136 NumBytes = 4; 137 break; 138 case TargetOpcode::PATCHABLE_FUNCTION_ENTER: 139 // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER 140 // instructions are expanded to the specified number of NOPs. Otherwise, 141 // they are expanded to 36-byte XRay sleds. 142 NumBytes = 143 F.getFnAttributeAsParsedInteger("patchable-function-entry", 9) * 4; 144 break; 145 case TargetOpcode::PATCHABLE_FUNCTION_EXIT: 146 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL: 147 // An XRay sled can be 4 bytes of alignment plus a 32-byte block. 148 NumBytes = 36; 149 break; 150 case TargetOpcode::PATCHABLE_EVENT_CALL: 151 // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment). 152 NumBytes = 24; 153 break; 154 155 case AArch64::SPACE: 156 NumBytes = MI.getOperand(1).getImm(); 157 break; 158 case TargetOpcode::BUNDLE: 159 NumBytes = getInstBundleLength(MI); 160 break; 161 } 162 163 return NumBytes; 164 } 165 166 unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const { 167 unsigned Size = 0; 168 MachineBasicBlock::const_instr_iterator I = MI.getIterator(); 169 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); 170 while (++I != E && I->isInsideBundle()) { 171 assert(!I->isBundle() && "No nested bundle!"); 172 Size += getInstSizeInBytes(*I); 173 } 174 return Size; 175 } 176 177 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, 178 SmallVectorImpl<MachineOperand> &Cond) { 179 // Block ends with fall-through condbranch. 
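  // The Cond encoding produced here is what reverseBranchCondition and
  // instantiateCondBranch below consume:
  //   Bcc          -> Cond = { cc }
  //   CB(N)Z[WX]   -> Cond = { -1, opcode, reg }
  //   TB(N)Z[WX]   -> Cond = { -1, opcode, reg, bit# }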
  switch (LastInst->getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    Target = LastInst->getOperand(2).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    Cond.push_back(LastInst->getOperand(1));
  }
}

static unsigned getBranchDisplacementBits(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return BDisplacementBits;
  case AArch64::TBNZW:
  case AArch64::TBZW:
  case AArch64::TBNZX:
  case AArch64::TBZX:
    return TBZDisplacementBits;
  case AArch64::CBNZW:
  case AArch64::CBZW:
  case AArch64::CBNZX:
  case AArch64::CBZX:
    return CBZDisplacementBits;
  case AArch64::Bcc:
    return BCCDisplacementBits;
  }
}

bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                             int64_t BrOffset) const {
  unsigned Bits = getBranchDisplacementBits(BranchOp);
  assert(Bits >= 3 && "max branch displacement must be enough to jump "
                      "over conditional branch expansion");
  return isIntN(Bits, BrOffset / 4);
}

MachineBasicBlock *
AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return MI.getOperand(0).getMBB();
  case AArch64::TBZW:
  case AArch64::TBNZW:
  case AArch64::TBZX:
  case AArch64::TBNZX:
    return MI.getOperand(2).getMBB();
  case AArch64::CBZW:
  case AArch64::CBNZW:
  case AArch64::CBZX:
  case AArch64::CBNZX:
  case AArch64::Bcc:
    return MI.getOperand(1).getMBB();
  }
}

void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
                                            MachineBasicBlock &NewDestBB,
                                            MachineBasicBlock &RestoreBB,
                                            const DebugLoc &DL,
                                            int64_t BrOffset,
                                            RegScavenger *RS) const {
  assert(RS && "RegScavenger required for long branching");
  assert(MBB.empty() &&
         "new block should be inserted for expanding unconditional branch");
  assert(MBB.pred_size() == 1);
  assert(RestoreBB.empty() &&
         "restore block should be inserted for restoring clobbered registers");

  auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
    // Offsets outside of the signed 33-bit range are not supported for ADRP +
    // ADD.
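    // (ADRP materializes a 4 KiB-aligned address within roughly +/-4 GiB via
    // a 21-bit signed page offset, and the ADD supplies the low 12 bits, which
    // is where the signed 33-bit limit on BrOffset comes from.)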
274 if (!isInt<33>(BrOffset)) 275 report_fatal_error( 276 "Branch offsets outside of the signed 33-bit range not supported"); 277 278 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg) 279 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE); 280 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg) 281 .addReg(Reg) 282 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC) 283 .addImm(0); 284 BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg); 285 }; 286 287 RS->enterBasicBlockEnd(MBB); 288 // If X16 is unused, we can rely on the linker to insert a range extension 289 // thunk if NewDestBB is out of range of a single B instruction. 290 constexpr Register Reg = AArch64::X16; 291 if (!RS->isRegUsed(Reg)) { 292 insertUnconditionalBranch(MBB, &NewDestBB, DL); 293 RS->setRegUsed(Reg); 294 return; 295 } 296 297 // If there's a free register and it's worth inflating the code size, 298 // manually insert the indirect branch. 299 Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass); 300 if (Scavenged != AArch64::NoRegister && 301 MBB.getSectionID() == MBBSectionID::ColdSectionID) { 302 buildIndirectBranch(Scavenged, NewDestBB); 303 RS->setRegUsed(Scavenged); 304 return; 305 } 306 307 // Note: Spilling X16 briefly moves the stack pointer, making it incompatible 308 // with red zones. 309 AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>(); 310 if (!AFI || AFI->hasRedZone().value_or(true)) 311 report_fatal_error( 312 "Unable to insert indirect branch inside function that has red zone"); 313 314 // Otherwise, spill X16 and defer range extension to the linker. 315 BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre)) 316 .addReg(AArch64::SP, RegState::Define) 317 .addReg(Reg) 318 .addReg(AArch64::SP) 319 .addImm(-16); 320 321 BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB); 322 323 BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost)) 324 .addReg(AArch64::SP, RegState::Define) 325 .addReg(Reg, RegState::Define) 326 .addReg(AArch64::SP) 327 .addImm(16); 328 } 329 330 // Branch analysis. 331 bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB, 332 MachineBasicBlock *&TBB, 333 MachineBasicBlock *&FBB, 334 SmallVectorImpl<MachineOperand> &Cond, 335 bool AllowModify) const { 336 // If the block has no terminators, it just falls into the block after it. 337 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); 338 if (I == MBB.end()) 339 return false; 340 341 // Skip over SpeculationBarrierEndBB terminators 342 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB || 343 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) { 344 --I; 345 } 346 347 if (!isUnpredicatedTerminator(*I)) 348 return false; 349 350 // Get the last instruction in the block. 351 MachineInstr *LastInst = &*I; 352 353 // If there is only one terminator instruction, process it. 354 unsigned LastOpc = LastInst->getOpcode(); 355 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { 356 if (isUncondBranchOpcode(LastOpc)) { 357 TBB = LastInst->getOperand(0).getMBB(); 358 return false; 359 } 360 if (isCondBranchOpcode(LastOpc)) { 361 // Block ends with fall-through condbranch. 362 parseCondBranch(LastInst, TBB, Cond); 363 return false; 364 } 365 return true; // Can't handle indirect branch. 366 } 367 368 // Get the instruction before it if it is a terminator. 
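  // From here on we look at the last two terminators together, e.g. the
  // common "Bcc %bb.then; B %bb.else" pair, which analyzeBranch reports as a
  // conditional branch with an explicit false destination.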
369 MachineInstr *SecondLastInst = &*I; 370 unsigned SecondLastOpc = SecondLastInst->getOpcode(); 371 372 // If AllowModify is true and the block ends with two or more unconditional 373 // branches, delete all but the first unconditional branch. 374 if (AllowModify && isUncondBranchOpcode(LastOpc)) { 375 while (isUncondBranchOpcode(SecondLastOpc)) { 376 LastInst->eraseFromParent(); 377 LastInst = SecondLastInst; 378 LastOpc = LastInst->getOpcode(); 379 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { 380 // Return now the only terminator is an unconditional branch. 381 TBB = LastInst->getOperand(0).getMBB(); 382 return false; 383 } 384 SecondLastInst = &*I; 385 SecondLastOpc = SecondLastInst->getOpcode(); 386 } 387 } 388 389 // If we're allowed to modify and the block ends in a unconditional branch 390 // which could simply fallthrough, remove the branch. (Note: This case only 391 // matters when we can't understand the whole sequence, otherwise it's also 392 // handled by BranchFolding.cpp.) 393 if (AllowModify && isUncondBranchOpcode(LastOpc) && 394 MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) { 395 LastInst->eraseFromParent(); 396 LastInst = SecondLastInst; 397 LastOpc = LastInst->getOpcode(); 398 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { 399 assert(!isUncondBranchOpcode(LastOpc) && 400 "unreachable unconditional branches removed above"); 401 402 if (isCondBranchOpcode(LastOpc)) { 403 // Block ends with fall-through condbranch. 404 parseCondBranch(LastInst, TBB, Cond); 405 return false; 406 } 407 return true; // Can't handle indirect branch. 408 } 409 SecondLastInst = &*I; 410 SecondLastOpc = SecondLastInst->getOpcode(); 411 } 412 413 // If there are three terminators, we don't know what sort of block this is. 414 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I)) 415 return true; 416 417 // If the block ends with a B and a Bcc, handle it. 418 if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { 419 parseCondBranch(SecondLastInst, TBB, Cond); 420 FBB = LastInst->getOperand(0).getMBB(); 421 return false; 422 } 423 424 // If the block ends with two unconditional branches, handle it. The second 425 // one is not executed, so remove it. 426 if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { 427 TBB = SecondLastInst->getOperand(0).getMBB(); 428 I = LastInst; 429 if (AllowModify) 430 I->eraseFromParent(); 431 return false; 432 } 433 434 // ...likewise if it ends with an indirect branch followed by an unconditional 435 // branch. 436 if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { 437 I = LastInst; 438 if (AllowModify) 439 I->eraseFromParent(); 440 return true; 441 } 442 443 // Otherwise, can't handle this. 444 return true; 445 } 446 447 bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB, 448 MachineBranchPredicate &MBP, 449 bool AllowModify) const { 450 // For the moment, handle only a block which ends with a cb(n)zx followed by 451 // a fallthrough. Why this? Because it is a common form. 452 // TODO: Should we handle b.cc? 453 454 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); 455 if (I == MBB.end()) 456 return true; 457 458 // Skip over SpeculationBarrierEndBB terminators 459 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB || 460 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) { 461 --I; 462 } 463 464 if (!isUnpredicatedTerminator(*I)) 465 return true; 466 467 // Get the last instruction in the block. 
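  // Only the compare-and-branch forms are recognized here; e.g. for
  // "cbnz x0, %bb.target" the predicate below becomes LHS = x0, RHS = #0,
  // PRED_NE, with the block's next block as the false destination.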
468 MachineInstr *LastInst = &*I; 469 unsigned LastOpc = LastInst->getOpcode(); 470 if (!isCondBranchOpcode(LastOpc)) 471 return true; 472 473 switch (LastOpc) { 474 default: 475 return true; 476 case AArch64::CBZW: 477 case AArch64::CBZX: 478 case AArch64::CBNZW: 479 case AArch64::CBNZX: 480 break; 481 }; 482 483 MBP.TrueDest = LastInst->getOperand(1).getMBB(); 484 assert(MBP.TrueDest && "expected!"); 485 MBP.FalseDest = MBB.getNextNode(); 486 487 MBP.ConditionDef = nullptr; 488 MBP.SingleUseCondition = false; 489 490 MBP.LHS = LastInst->getOperand(0); 491 MBP.RHS = MachineOperand::CreateImm(0); 492 MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE 493 : MachineBranchPredicate::PRED_EQ; 494 return false; 495 } 496 497 bool AArch64InstrInfo::reverseBranchCondition( 498 SmallVectorImpl<MachineOperand> &Cond) const { 499 if (Cond[0].getImm() != -1) { 500 // Regular Bcc 501 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm(); 502 Cond[0].setImm(AArch64CC::getInvertedCondCode(CC)); 503 } else { 504 // Folded compare-and-branch 505 switch (Cond[1].getImm()) { 506 default: 507 llvm_unreachable("Unknown conditional branch!"); 508 case AArch64::CBZW: 509 Cond[1].setImm(AArch64::CBNZW); 510 break; 511 case AArch64::CBNZW: 512 Cond[1].setImm(AArch64::CBZW); 513 break; 514 case AArch64::CBZX: 515 Cond[1].setImm(AArch64::CBNZX); 516 break; 517 case AArch64::CBNZX: 518 Cond[1].setImm(AArch64::CBZX); 519 break; 520 case AArch64::TBZW: 521 Cond[1].setImm(AArch64::TBNZW); 522 break; 523 case AArch64::TBNZW: 524 Cond[1].setImm(AArch64::TBZW); 525 break; 526 case AArch64::TBZX: 527 Cond[1].setImm(AArch64::TBNZX); 528 break; 529 case AArch64::TBNZX: 530 Cond[1].setImm(AArch64::TBZX); 531 break; 532 } 533 } 534 535 return false; 536 } 537 538 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB, 539 int *BytesRemoved) const { 540 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); 541 if (I == MBB.end()) 542 return 0; 543 544 if (!isUncondBranchOpcode(I->getOpcode()) && 545 !isCondBranchOpcode(I->getOpcode())) 546 return 0; 547 548 // Remove the branch. 549 I->eraseFromParent(); 550 551 I = MBB.end(); 552 553 if (I == MBB.begin()) { 554 if (BytesRemoved) 555 *BytesRemoved = 4; 556 return 1; 557 } 558 --I; 559 if (!isCondBranchOpcode(I->getOpcode())) { 560 if (BytesRemoved) 561 *BytesRemoved = 4; 562 return 1; 563 } 564 565 // Remove the branch. 566 I->eraseFromParent(); 567 if (BytesRemoved) 568 *BytesRemoved = 8; 569 570 return 2; 571 } 572 573 void AArch64InstrInfo::instantiateCondBranch( 574 MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB, 575 ArrayRef<MachineOperand> Cond) const { 576 if (Cond[0].getImm() != -1) { 577 // Regular Bcc 578 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB); 579 } else { 580 // Folded compare-and-branch 581 // Note that we use addOperand instead of addReg to keep the flags. 582 const MachineInstrBuilder MIB = 583 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]); 584 if (Cond.size() > 3) 585 MIB.addImm(Cond[3].getImm()); 586 MIB.addMBB(TBB); 587 } 588 } 589 590 unsigned AArch64InstrInfo::insertBranch( 591 MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, 592 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const { 593 // Shouldn't be a fall through. 594 assert(TBB && "insertBranch must not be told to insert a fallthrough"); 595 596 if (!FBB) { 597 if (Cond.empty()) // Unconditional branch? 
598 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB); 599 else 600 instantiateCondBranch(MBB, DL, TBB, Cond); 601 602 if (BytesAdded) 603 *BytesAdded = 4; 604 605 return 1; 606 } 607 608 // Two-way conditional branch. 609 instantiateCondBranch(MBB, DL, TBB, Cond); 610 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB); 611 612 if (BytesAdded) 613 *BytesAdded = 8; 614 615 return 2; 616 } 617 618 // Find the original register that VReg is copied from. 619 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { 620 while (Register::isVirtualRegister(VReg)) { 621 const MachineInstr *DefMI = MRI.getVRegDef(VReg); 622 if (!DefMI->isFullCopy()) 623 return VReg; 624 VReg = DefMI->getOperand(1).getReg(); 625 } 626 return VReg; 627 } 628 629 // Determine if VReg is defined by an instruction that can be folded into a 630 // csel instruction. If so, return the folded opcode, and the replacement 631 // register. 632 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, 633 unsigned *NewVReg = nullptr) { 634 VReg = removeCopies(MRI, VReg); 635 if (!Register::isVirtualRegister(VReg)) 636 return 0; 637 638 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg)); 639 const MachineInstr *DefMI = MRI.getVRegDef(VReg); 640 unsigned Opc = 0; 641 unsigned SrcOpNum = 0; 642 switch (DefMI->getOpcode()) { 643 case AArch64::ADDSXri: 644 case AArch64::ADDSWri: 645 // if NZCV is used, do not fold. 646 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 647 return 0; 648 // fall-through to ADDXri and ADDWri. 649 [[fallthrough]]; 650 case AArch64::ADDXri: 651 case AArch64::ADDWri: 652 // add x, 1 -> csinc. 653 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 || 654 DefMI->getOperand(3).getImm() != 0) 655 return 0; 656 SrcOpNum = 1; 657 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr; 658 break; 659 660 case AArch64::ORNXrr: 661 case AArch64::ORNWrr: { 662 // not x -> csinv, represented as orn dst, xzr, src. 663 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); 664 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) 665 return 0; 666 SrcOpNum = 2; 667 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr; 668 break; 669 } 670 671 case AArch64::SUBSXrr: 672 case AArch64::SUBSWrr: 673 // if NZCV is used, do not fold. 674 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 675 return 0; 676 // fall-through to SUBXrr and SUBWrr. 677 [[fallthrough]]; 678 case AArch64::SUBXrr: 679 case AArch64::SUBWrr: { 680 // neg x -> csneg, represented as sub dst, xzr, src. 681 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); 682 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) 683 return 0; 684 SrcOpNum = 2; 685 Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr; 686 break; 687 } 688 default: 689 return 0; 690 } 691 assert(Opc && SrcOpNum && "Missing parameters"); 692 693 if (NewVReg) 694 *NewVReg = DefMI->getOperand(SrcOpNum).getReg(); 695 return Opc; 696 } 697 698 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 699 ArrayRef<MachineOperand> Cond, 700 Register DstReg, Register TrueReg, 701 Register FalseReg, int &CondCycles, 702 int &TrueCycles, 703 int &FalseCycles) const { 704 // Check register classes. 
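  // This hook is queried (e.g. by early if-conversion) to decide whether a
  // diamond can be turned into csel/fcsel; the cycle counts filled in below
  // are rough latency estimates, not exact per-core numbers.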
705 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 706 const TargetRegisterClass *RC = 707 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); 708 if (!RC) 709 return false; 710 711 // Also need to check the dest regclass, in case we're trying to optimize 712 // something like: 713 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2 714 if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg))) 715 return false; 716 717 // Expanding cbz/tbz requires an extra cycle of latency on the condition. 718 unsigned ExtraCondLat = Cond.size() != 1; 719 720 // GPRs are handled by csel. 721 // FIXME: Fold in x+1, -x, and ~x when applicable. 722 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) || 723 AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 724 // Single-cycle csel, csinc, csinv, and csneg. 725 CondCycles = 1 + ExtraCondLat; 726 TrueCycles = FalseCycles = 1; 727 if (canFoldIntoCSel(MRI, TrueReg)) 728 TrueCycles = 0; 729 else if (canFoldIntoCSel(MRI, FalseReg)) 730 FalseCycles = 0; 731 return true; 732 } 733 734 // Scalar floating point is handled by fcsel. 735 // FIXME: Form fabs, fmin, and fmax when applicable. 736 if (AArch64::FPR64RegClass.hasSubClassEq(RC) || 737 AArch64::FPR32RegClass.hasSubClassEq(RC)) { 738 CondCycles = 5 + ExtraCondLat; 739 TrueCycles = FalseCycles = 2; 740 return true; 741 } 742 743 // Can't do vectors. 744 return false; 745 } 746 747 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, 748 MachineBasicBlock::iterator I, 749 const DebugLoc &DL, Register DstReg, 750 ArrayRef<MachineOperand> Cond, 751 Register TrueReg, Register FalseReg) const { 752 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 753 754 // Parse the condition code, see parseCondBranch() above. 755 AArch64CC::CondCode CC; 756 switch (Cond.size()) { 757 default: 758 llvm_unreachable("Unknown condition opcode in Cond"); 759 case 1: // b.cc 760 CC = AArch64CC::CondCode(Cond[0].getImm()); 761 break; 762 case 3: { // cbz/cbnz 763 // We must insert a compare against 0. 764 bool Is64Bit; 765 switch (Cond[1].getImm()) { 766 default: 767 llvm_unreachable("Unknown branch opcode in Cond"); 768 case AArch64::CBZW: 769 Is64Bit = false; 770 CC = AArch64CC::EQ; 771 break; 772 case AArch64::CBZX: 773 Is64Bit = true; 774 CC = AArch64CC::EQ; 775 break; 776 case AArch64::CBNZW: 777 Is64Bit = false; 778 CC = AArch64CC::NE; 779 break; 780 case AArch64::CBNZX: 781 Is64Bit = true; 782 CC = AArch64CC::NE; 783 break; 784 } 785 Register SrcReg = Cond[2].getReg(); 786 if (Is64Bit) { 787 // cmp reg, #0 is actually subs xzr, reg, #0. 788 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass); 789 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR) 790 .addReg(SrcReg) 791 .addImm(0) 792 .addImm(0); 793 } else { 794 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass); 795 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR) 796 .addReg(SrcReg) 797 .addImm(0) 798 .addImm(0); 799 } 800 break; 801 } 802 case 4: { // tbz/tbnz 803 // We must insert a tst instruction. 804 switch (Cond[1].getImm()) { 805 default: 806 llvm_unreachable("Unknown branch opcode in Cond"); 807 case AArch64::TBZW: 808 case AArch64::TBZX: 809 CC = AArch64CC::EQ; 810 break; 811 case AArch64::TBNZW: 812 case AArch64::TBNZX: 813 CC = AArch64CC::NE; 814 break; 815 } 816 // cmp reg, #foo is actually ands xzr, reg, #1<<foo. 
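    // For instance, a TBZW of bit 3 becomes "ands wzr, w<reg>, #0x8" with the
    // select condition set to EQ.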
    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
      BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
    else
      BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
    break;
  }
  }

  unsigned Opc = 0;
  const TargetRegisterClass *RC = nullptr;
  bool TryFold = false;
  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
    RC = &AArch64::GPR64RegClass;
    Opc = AArch64::CSELXr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
    RC = &AArch64::GPR32RegClass;
    Opc = AArch64::CSELWr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FCSELDrrr;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
    RC = &AArch64::FPR32RegClass;
    Opc = AArch64::FCSELSrrr;
  }
  assert(RC && "Unsupported regclass");

  // Try folding simple instructions into the csel.
  if (TryFold) {
    unsigned NewVReg = 0;
    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
    if (FoldedOpc) {
      // The folded opcodes csinc, csinv and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
      CC = AArch64CC::getInvertedCondCode(CC);
      TrueReg = FalseReg;
    } else
      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);

    // Fold the operation. Leave any dead instructions for DCE to clean up.
    if (FoldedOpc) {
      FalseReg = NewVReg;
      Opc = FoldedOpc;
      // This extends the live range of NewVReg.
      MRI.clearKillFlags(NewVReg);
    }
  }

  // Pull all virtual registers into the appropriate class.
  MRI.constrainRegClass(TrueReg, RC);
  MRI.constrainRegClass(FalseReg, RC);

  // Insert the csel.
  BuildMI(MBB, I, DL, get(Opc), DstReg)
      .addReg(TrueReg)
      .addReg(FalseReg)
      .addImm(CC);
}

// Return true if Imm can be loaded into a register by a "cheap" sequence of
// instructions. For now, "cheap" means at most two instructions.
static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
  if (BitSize == 32)
    return true;

  assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
  uint64_t Imm = static_cast<uint64_t>(MI.getOperand(1).getImm());
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
  AArch64_IMM::expandMOVImm(Imm, BitSize, Is);

  return Is.size() <= 2;
}

// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
  if (Subtarget.hasExynosCheapAsMoveHandling()) {
    if (isExynosCheapAsMove(MI))
      return true;
    return MI.isAsCheapAsAMove();
  }

  switch (MI.getOpcode()) {
  default:
    return MI.isAsCheapAsAMove();

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::SUBWrs:
  case AArch64::SUBXrs:
    return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;

  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
  // ORRXri, it is as cheap as MOV.
  // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
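  // For example, MOVi64imm #0xffff00000000 expands to a single
  // "movz x0, #0xffff, lsl #32" and is treated as cheap, whereas a constant
  // needing a MOVZ plus three MOVKs is not.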
919 case AArch64::MOVi32imm: 920 return isCheapImmediate(MI, 32); 921 case AArch64::MOVi64imm: 922 return isCheapImmediate(MI, 64); 923 } 924 } 925 926 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) { 927 switch (MI.getOpcode()) { 928 default: 929 return false; 930 931 case AArch64::ADDWrs: 932 case AArch64::ADDXrs: 933 case AArch64::ADDSWrs: 934 case AArch64::ADDSXrs: { 935 unsigned Imm = MI.getOperand(3).getImm(); 936 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 937 if (ShiftVal == 0) 938 return true; 939 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5; 940 } 941 942 case AArch64::ADDWrx: 943 case AArch64::ADDXrx: 944 case AArch64::ADDXrx64: 945 case AArch64::ADDSWrx: 946 case AArch64::ADDSXrx: 947 case AArch64::ADDSXrx64: { 948 unsigned Imm = MI.getOperand(3).getImm(); 949 switch (AArch64_AM::getArithExtendType(Imm)) { 950 default: 951 return false; 952 case AArch64_AM::UXTB: 953 case AArch64_AM::UXTH: 954 case AArch64_AM::UXTW: 955 case AArch64_AM::UXTX: 956 return AArch64_AM::getArithShiftValue(Imm) <= 4; 957 } 958 } 959 960 case AArch64::SUBWrs: 961 case AArch64::SUBSWrs: { 962 unsigned Imm = MI.getOperand(3).getImm(); 963 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 964 return ShiftVal == 0 || 965 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31); 966 } 967 968 case AArch64::SUBXrs: 969 case AArch64::SUBSXrs: { 970 unsigned Imm = MI.getOperand(3).getImm(); 971 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 972 return ShiftVal == 0 || 973 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63); 974 } 975 976 case AArch64::SUBWrx: 977 case AArch64::SUBXrx: 978 case AArch64::SUBXrx64: 979 case AArch64::SUBSWrx: 980 case AArch64::SUBSXrx: 981 case AArch64::SUBSXrx64: { 982 unsigned Imm = MI.getOperand(3).getImm(); 983 switch (AArch64_AM::getArithExtendType(Imm)) { 984 default: 985 return false; 986 case AArch64_AM::UXTB: 987 case AArch64_AM::UXTH: 988 case AArch64_AM::UXTW: 989 case AArch64_AM::UXTX: 990 return AArch64_AM::getArithShiftValue(Imm) == 0; 991 } 992 } 993 994 case AArch64::LDRBBroW: 995 case AArch64::LDRBBroX: 996 case AArch64::LDRBroW: 997 case AArch64::LDRBroX: 998 case AArch64::LDRDroW: 999 case AArch64::LDRDroX: 1000 case AArch64::LDRHHroW: 1001 case AArch64::LDRHHroX: 1002 case AArch64::LDRHroW: 1003 case AArch64::LDRHroX: 1004 case AArch64::LDRQroW: 1005 case AArch64::LDRQroX: 1006 case AArch64::LDRSBWroW: 1007 case AArch64::LDRSBWroX: 1008 case AArch64::LDRSBXroW: 1009 case AArch64::LDRSBXroX: 1010 case AArch64::LDRSHWroW: 1011 case AArch64::LDRSHWroX: 1012 case AArch64::LDRSHXroW: 1013 case AArch64::LDRSHXroX: 1014 case AArch64::LDRSWroW: 1015 case AArch64::LDRSWroX: 1016 case AArch64::LDRSroW: 1017 case AArch64::LDRSroX: 1018 case AArch64::LDRWroW: 1019 case AArch64::LDRWroX: 1020 case AArch64::LDRXroW: 1021 case AArch64::LDRXroX: 1022 case AArch64::PRFMroW: 1023 case AArch64::PRFMroX: 1024 case AArch64::STRBBroW: 1025 case AArch64::STRBBroX: 1026 case AArch64::STRBroW: 1027 case AArch64::STRBroX: 1028 case AArch64::STRDroW: 1029 case AArch64::STRDroX: 1030 case AArch64::STRHHroW: 1031 case AArch64::STRHHroX: 1032 case AArch64::STRHroW: 1033 case AArch64::STRHroX: 1034 case AArch64::STRQroW: 1035 case AArch64::STRQroX: 1036 case AArch64::STRSroW: 1037 case AArch64::STRSroX: 1038 case AArch64::STRWroW: 1039 case AArch64::STRWroX: 1040 case AArch64::STRXroW: 1041 case AArch64::STRXroX: { 1042 unsigned IsSigned = MI.getOperand(3).getImm(); 1043 return !IsSigned; 1044 } 
1045 } 1046 } 1047 1048 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) { 1049 unsigned Opc = MI.getOpcode(); 1050 switch (Opc) { 1051 default: 1052 return false; 1053 case AArch64::SEH_StackAlloc: 1054 case AArch64::SEH_SaveFPLR: 1055 case AArch64::SEH_SaveFPLR_X: 1056 case AArch64::SEH_SaveReg: 1057 case AArch64::SEH_SaveReg_X: 1058 case AArch64::SEH_SaveRegP: 1059 case AArch64::SEH_SaveRegP_X: 1060 case AArch64::SEH_SaveFReg: 1061 case AArch64::SEH_SaveFReg_X: 1062 case AArch64::SEH_SaveFRegP: 1063 case AArch64::SEH_SaveFRegP_X: 1064 case AArch64::SEH_SetFP: 1065 case AArch64::SEH_AddFP: 1066 case AArch64::SEH_Nop: 1067 case AArch64::SEH_PrologEnd: 1068 case AArch64::SEH_EpilogStart: 1069 case AArch64::SEH_EpilogEnd: 1070 case AArch64::SEH_PACSignLR: 1071 case AArch64::SEH_SaveAnyRegQP: 1072 case AArch64::SEH_SaveAnyRegQPX: 1073 return true; 1074 } 1075 } 1076 1077 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, 1078 Register &SrcReg, Register &DstReg, 1079 unsigned &SubIdx) const { 1080 switch (MI.getOpcode()) { 1081 default: 1082 return false; 1083 case AArch64::SBFMXri: // aka sxtw 1084 case AArch64::UBFMXri: // aka uxtw 1085 // Check for the 32 -> 64 bit extension case, these instructions can do 1086 // much more. 1087 if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31) 1088 return false; 1089 // This is a signed or unsigned 32 -> 64 bit extension. 1090 SrcReg = MI.getOperand(1).getReg(); 1091 DstReg = MI.getOperand(0).getReg(); 1092 SubIdx = AArch64::sub_32; 1093 return true; 1094 } 1095 } 1096 1097 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint( 1098 const MachineInstr &MIa, const MachineInstr &MIb) const { 1099 const TargetRegisterInfo *TRI = &getRegisterInfo(); 1100 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr; 1101 int64_t OffsetA = 0, OffsetB = 0; 1102 TypeSize WidthA(0, false), WidthB(0, false); 1103 bool OffsetAIsScalable = false, OffsetBIsScalable = false; 1104 1105 assert(MIa.mayLoadOrStore() && "MIa must be a load or store."); 1106 assert(MIb.mayLoadOrStore() && "MIb must be a load or store."); 1107 1108 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() || 1109 MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) 1110 return false; 1111 1112 // Retrieve the base, offset from the base and width. Width 1113 // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If 1114 // base are identical, and the offset of a lower memory access + 1115 // the width doesn't overlap the offset of a higher memory access, 1116 // then the memory accesses are different. 1117 // If OffsetAIsScalable and OffsetBIsScalable are both true, they 1118 // are assumed to have the same scale (vscale). 1119 if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable, 1120 WidthA, TRI) && 1121 getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable, 1122 WidthB, TRI)) { 1123 if (BaseOpA->isIdenticalTo(*BaseOpB) && 1124 OffsetAIsScalable == OffsetBIsScalable) { 1125 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; 1126 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; 1127 TypeSize LowWidth = (LowOffset == OffsetA) ? 
WidthA : WidthB; 1128 if (LowWidth.isScalable() == OffsetAIsScalable && 1129 LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset) 1130 return true; 1131 } 1132 } 1133 return false; 1134 } 1135 1136 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI, 1137 const MachineBasicBlock *MBB, 1138 const MachineFunction &MF) const { 1139 if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF)) 1140 return true; 1141 1142 // Do not move an instruction that can be recognized as a branch target. 1143 if (hasBTISemantics(MI)) 1144 return true; 1145 1146 switch (MI.getOpcode()) { 1147 case AArch64::HINT: 1148 // CSDB hints are scheduling barriers. 1149 if (MI.getOperand(0).getImm() == 0x14) 1150 return true; 1151 break; 1152 case AArch64::DSB: 1153 case AArch64::ISB: 1154 // DSB and ISB also are scheduling barriers. 1155 return true; 1156 case AArch64::MSRpstatesvcrImm1: 1157 // SMSTART and SMSTOP are also scheduling barriers. 1158 return true; 1159 default:; 1160 } 1161 if (isSEHInstruction(MI)) 1162 return true; 1163 auto Next = std::next(MI.getIterator()); 1164 return Next != MBB->end() && Next->isCFIInstruction(); 1165 } 1166 1167 /// analyzeCompare - For a comparison instruction, return the source registers 1168 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. 1169 /// Return true if the comparison instruction can be analyzed. 1170 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, 1171 Register &SrcReg2, int64_t &CmpMask, 1172 int64_t &CmpValue) const { 1173 // The first operand can be a frame index where we'd normally expect a 1174 // register. 1175 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands"); 1176 if (!MI.getOperand(1).isReg()) 1177 return false; 1178 1179 switch (MI.getOpcode()) { 1180 default: 1181 break; 1182 case AArch64::PTEST_PP: 1183 case AArch64::PTEST_PP_ANY: 1184 SrcReg = MI.getOperand(0).getReg(); 1185 SrcReg2 = MI.getOperand(1).getReg(); 1186 // Not sure about the mask and value for now... 1187 CmpMask = ~0; 1188 CmpValue = 0; 1189 return true; 1190 case AArch64::SUBSWrr: 1191 case AArch64::SUBSWrs: 1192 case AArch64::SUBSWrx: 1193 case AArch64::SUBSXrr: 1194 case AArch64::SUBSXrs: 1195 case AArch64::SUBSXrx: 1196 case AArch64::ADDSWrr: 1197 case AArch64::ADDSWrs: 1198 case AArch64::ADDSWrx: 1199 case AArch64::ADDSXrr: 1200 case AArch64::ADDSXrs: 1201 case AArch64::ADDSXrx: 1202 // Replace SUBSWrr with SUBWrr if NZCV is not used. 1203 SrcReg = MI.getOperand(1).getReg(); 1204 SrcReg2 = MI.getOperand(2).getReg(); 1205 CmpMask = ~0; 1206 CmpValue = 0; 1207 return true; 1208 case AArch64::SUBSWri: 1209 case AArch64::ADDSWri: 1210 case AArch64::SUBSXri: 1211 case AArch64::ADDSXri: 1212 SrcReg = MI.getOperand(1).getReg(); 1213 SrcReg2 = 0; 1214 CmpMask = ~0; 1215 CmpValue = MI.getOperand(2).getImm(); 1216 return true; 1217 case AArch64::ANDSWri: 1218 case AArch64::ANDSXri: 1219 // ANDS does not use the same encoding scheme as the others xxxS 1220 // instructions. 1221 SrcReg = MI.getOperand(1).getReg(); 1222 SrcReg2 = 0; 1223 CmpMask = ~0; 1224 CmpValue = AArch64_AM::decodeLogicalImmediate( 1225 MI.getOperand(2).getImm(), 1226 MI.getOpcode() == AArch64::ANDSWri ? 
32 : 64); 1227 return true; 1228 } 1229 1230 return false; 1231 } 1232 1233 static bool UpdateOperandRegClass(MachineInstr &Instr) { 1234 MachineBasicBlock *MBB = Instr.getParent(); 1235 assert(MBB && "Can't get MachineBasicBlock here"); 1236 MachineFunction *MF = MBB->getParent(); 1237 assert(MF && "Can't get MachineFunction here"); 1238 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); 1239 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); 1240 MachineRegisterInfo *MRI = &MF->getRegInfo(); 1241 1242 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx; 1243 ++OpIdx) { 1244 MachineOperand &MO = Instr.getOperand(OpIdx); 1245 const TargetRegisterClass *OpRegCstraints = 1246 Instr.getRegClassConstraint(OpIdx, TII, TRI); 1247 1248 // If there's no constraint, there's nothing to do. 1249 if (!OpRegCstraints) 1250 continue; 1251 // If the operand is a frame index, there's nothing to do here. 1252 // A frame index operand will resolve correctly during PEI. 1253 if (MO.isFI()) 1254 continue; 1255 1256 assert(MO.isReg() && 1257 "Operand has register constraints without being a register!"); 1258 1259 Register Reg = MO.getReg(); 1260 if (Reg.isPhysical()) { 1261 if (!OpRegCstraints->contains(Reg)) 1262 return false; 1263 } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) && 1264 !MRI->constrainRegClass(Reg, OpRegCstraints)) 1265 return false; 1266 } 1267 1268 return true; 1269 } 1270 1271 /// Return the opcode that does not set flags when possible - otherwise 1272 /// return the original opcode. The caller is responsible to do the actual 1273 /// substitution and legality checking. 1274 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) { 1275 // Don't convert all compare instructions, because for some the zero register 1276 // encoding becomes the sp register. 1277 bool MIDefinesZeroReg = false; 1278 if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR)) 1279 MIDefinesZeroReg = true; 1280 1281 switch (MI.getOpcode()) { 1282 default: 1283 return MI.getOpcode(); 1284 case AArch64::ADDSWrr: 1285 return AArch64::ADDWrr; 1286 case AArch64::ADDSWri: 1287 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri; 1288 case AArch64::ADDSWrs: 1289 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs; 1290 case AArch64::ADDSWrx: 1291 return AArch64::ADDWrx; 1292 case AArch64::ADDSXrr: 1293 return AArch64::ADDXrr; 1294 case AArch64::ADDSXri: 1295 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri; 1296 case AArch64::ADDSXrs: 1297 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs; 1298 case AArch64::ADDSXrx: 1299 return AArch64::ADDXrx; 1300 case AArch64::SUBSWrr: 1301 return AArch64::SUBWrr; 1302 case AArch64::SUBSWri: 1303 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri; 1304 case AArch64::SUBSWrs: 1305 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs; 1306 case AArch64::SUBSWrx: 1307 return AArch64::SUBWrx; 1308 case AArch64::SUBSXrr: 1309 return AArch64::SUBXrr; 1310 case AArch64::SUBSXri: 1311 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri; 1312 case AArch64::SUBSXrs: 1313 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs; 1314 case AArch64::SUBSXrx: 1315 return AArch64::SUBXrx; 1316 } 1317 } 1318 1319 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 }; 1320 1321 /// True when condition flags are accessed (either by writing or reading) 1322 /// on the instruction trace starting at From and ending at To. 
1323 /// 1324 /// Note: If From and To are from different blocks it's assumed CC are accessed 1325 /// on the path. 1326 static bool areCFlagsAccessedBetweenInstrs( 1327 MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, 1328 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) { 1329 // Early exit if To is at the beginning of the BB. 1330 if (To == To->getParent()->begin()) 1331 return true; 1332 1333 // Check whether the instructions are in the same basic block 1334 // If not, assume the condition flags might get modified somewhere. 1335 if (To->getParent() != From->getParent()) 1336 return true; 1337 1338 // From must be above To. 1339 assert(std::any_of( 1340 ++To.getReverse(), To->getParent()->rend(), 1341 [From](MachineInstr &MI) { return MI.getIterator() == From; })); 1342 1343 // We iterate backward starting at \p To until we hit \p From. 1344 for (const MachineInstr &Instr : 1345 instructionsWithoutDebug(++To.getReverse(), From.getReverse())) { 1346 if (((AccessToCheck & AK_Write) && 1347 Instr.modifiesRegister(AArch64::NZCV, TRI)) || 1348 ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI))) 1349 return true; 1350 } 1351 return false; 1352 } 1353 1354 /// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating 1355 /// operation which could set the flags in an identical manner 1356 bool AArch64InstrInfo::optimizePTestInstr( 1357 MachineInstr *PTest, unsigned MaskReg, unsigned PredReg, 1358 const MachineRegisterInfo *MRI) const { 1359 auto *Mask = MRI->getUniqueVRegDef(MaskReg); 1360 auto *Pred = MRI->getUniqueVRegDef(PredReg); 1361 auto NewOp = Pred->getOpcode(); 1362 bool OpChanged = false; 1363 1364 unsigned MaskOpcode = Mask->getOpcode(); 1365 unsigned PredOpcode = Pred->getOpcode(); 1366 bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode); 1367 bool PredIsWhileLike = isWhileOpcode(PredOpcode); 1368 1369 if (isPTrueOpcode(MaskOpcode) && (PredIsPTestLike || PredIsWhileLike) && 1370 getElementSizeForOpcode(MaskOpcode) == 1371 getElementSizeForOpcode(PredOpcode) && 1372 Mask->getOperand(1).getImm() == 31) { 1373 // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is 1374 // redundant since WHILE performs an implicit PTEST with an all active 1375 // mask. Must be an all active predicate of matching element size. 1376 1377 // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the 1378 // PTEST_LIKE instruction uses the same all active mask and the element 1379 // size matches. If the PTEST has a condition of any then it is always 1380 // redundant. 1381 if (PredIsPTestLike) { 1382 auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); 1383 if (Mask != PTestLikeMask && PTest->getOpcode() != AArch64::PTEST_PP_ANY) 1384 return false; 1385 } 1386 1387 // Fallthough to simply remove the PTEST. 1388 } else if ((Mask == Pred) && (PredIsPTestLike || PredIsWhileLike) && 1389 PTest->getOpcode() == AArch64::PTEST_PP_ANY) { 1390 // For PTEST(PG, PG), PTEST is redundant when PG is the result of an 1391 // instruction that sets the flags as PTEST would. This is only valid when 1392 // the condition is any. 1393 1394 // Fallthough to simply remove the PTEST. 1395 } else if (PredIsPTestLike) { 1396 // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the 1397 // flags are set based on the same mask 'PG', but PTEST_LIKE must operate 1398 // on 8-bit predicates like the PTEST. 
Otherwise, for instructions like 1399 // compare that also support 16/32/64-bit predicates, the implicit PTEST 1400 // performed by the compare could consider fewer lanes for these element 1401 // sizes. 1402 // 1403 // For example, consider 1404 // 1405 // ptrue p0.b ; P0=1111-1111-1111-1111 1406 // index z0.s, #0, #1 ; Z0=<0,1,2,3> 1407 // index z1.s, #1, #1 ; Z1=<1,2,3,4> 1408 // cmphi p1.s, p0/z, z1.s, z0.s ; P1=0001-0001-0001-0001 1409 // ; ^ last active 1410 // ptest p0, p1.b ; P1=0001-0001-0001-0001 1411 // ; ^ last active 1412 // 1413 // where the compare generates a canonical all active 32-bit predicate 1414 // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last 1415 // active flag, whereas the PTEST instruction with the same mask doesn't. 1416 // For PTEST_ANY this doesn't apply as the flags in this case would be 1417 // identical regardless of element size. 1418 auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); 1419 uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode); 1420 if ((Mask != PTestLikeMask) || 1421 (PredElementSize != AArch64::ElementSizeB && 1422 PTest->getOpcode() != AArch64::PTEST_PP_ANY)) 1423 return false; 1424 1425 // Fallthough to simply remove the PTEST. 1426 } else { 1427 // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the 1428 // opcode so the PTEST becomes redundant. 1429 switch (PredOpcode) { 1430 case AArch64::AND_PPzPP: 1431 case AArch64::BIC_PPzPP: 1432 case AArch64::EOR_PPzPP: 1433 case AArch64::NAND_PPzPP: 1434 case AArch64::NOR_PPzPP: 1435 case AArch64::ORN_PPzPP: 1436 case AArch64::ORR_PPzPP: 1437 case AArch64::BRKA_PPzP: 1438 case AArch64::BRKPA_PPzPP: 1439 case AArch64::BRKB_PPzP: 1440 case AArch64::BRKPB_PPzPP: 1441 case AArch64::RDFFR_PPz: { 1442 // Check to see if our mask is the same. If not the resulting flag bits 1443 // may be different and we can't remove the ptest. 1444 auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); 1445 if (Mask != PredMask) 1446 return false; 1447 break; 1448 } 1449 case AArch64::BRKN_PPzP: { 1450 // BRKN uses an all active implicit mask to set flags unlike the other 1451 // flag-setting instructions. 1452 // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B). 1453 if ((MaskOpcode != AArch64::PTRUE_B) || 1454 (Mask->getOperand(1).getImm() != 31)) 1455 return false; 1456 break; 1457 } 1458 case AArch64::PTRUE_B: 1459 // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A) 1460 break; 1461 default: 1462 // Bail out if we don't recognize the input 1463 return false; 1464 } 1465 1466 NewOp = convertToFlagSettingOpc(PredOpcode); 1467 OpChanged = true; 1468 } 1469 1470 const TargetRegisterInfo *TRI = &getRegisterInfo(); 1471 1472 // If another instruction between Pred and PTest accesses flags, don't remove 1473 // the ptest or update the earlier instruction to modify them. 1474 if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI)) 1475 return false; 1476 1477 // If we pass all the checks, it's safe to remove the PTEST and use the flags 1478 // as they are prior to PTEST. Sometimes this requires the tested PTEST 1479 // operand to be replaced with an equivalent instruction that also sets the 1480 // flags. 1481 Pred->setDesc(get(NewOp)); 1482 PTest->eraseFromParent(); 1483 if (OpChanged) { 1484 bool succeeded = UpdateOperandRegClass(*Pred); 1485 (void)succeeded; 1486 assert(succeeded && "Operands have incompatible register classes!"); 1487 Pred->addRegisterDefined(AArch64::NZCV, TRI); 1488 } 1489 1490 // Ensure that the flags def is live. 
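  // (If the PTEST being removed was the only reader of NZCV, the def on Pred
  // may currently be marked dead and has to be revived here.)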
1491 if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) { 1492 unsigned i = 0, e = Pred->getNumOperands(); 1493 for (; i != e; ++i) { 1494 MachineOperand &MO = Pred->getOperand(i); 1495 if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) { 1496 MO.setIsDead(false); 1497 break; 1498 } 1499 } 1500 } 1501 return true; 1502 } 1503 1504 /// Try to optimize a compare instruction. A compare instruction is an 1505 /// instruction which produces AArch64::NZCV. It can be truly compare 1506 /// instruction 1507 /// when there are no uses of its destination register. 1508 /// 1509 /// The following steps are tried in order: 1510 /// 1. Convert CmpInstr into an unconditional version. 1511 /// 2. Remove CmpInstr if above there is an instruction producing a needed 1512 /// condition code or an instruction which can be converted into such an 1513 /// instruction. 1514 /// Only comparison with zero is supported. 1515 bool AArch64InstrInfo::optimizeCompareInstr( 1516 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, 1517 int64_t CmpValue, const MachineRegisterInfo *MRI) const { 1518 assert(CmpInstr.getParent()); 1519 assert(MRI); 1520 1521 // Replace SUBSWrr with SUBWrr if NZCV is not used. 1522 int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true); 1523 if (DeadNZCVIdx != -1) { 1524 if (CmpInstr.definesRegister(AArch64::WZR) || 1525 CmpInstr.definesRegister(AArch64::XZR)) { 1526 CmpInstr.eraseFromParent(); 1527 return true; 1528 } 1529 unsigned Opc = CmpInstr.getOpcode(); 1530 unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr); 1531 if (NewOpc == Opc) 1532 return false; 1533 const MCInstrDesc &MCID = get(NewOpc); 1534 CmpInstr.setDesc(MCID); 1535 CmpInstr.removeOperand(DeadNZCVIdx); 1536 bool succeeded = UpdateOperandRegClass(CmpInstr); 1537 (void)succeeded; 1538 assert(succeeded && "Some operands reg class are incompatible!"); 1539 return true; 1540 } 1541 1542 if (CmpInstr.getOpcode() == AArch64::PTEST_PP || 1543 CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY) 1544 return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI); 1545 1546 if (SrcReg2 != 0) 1547 return false; 1548 1549 // CmpInstr is a Compare instruction if destination register is not used. 1550 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg())) 1551 return false; 1552 1553 if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI)) 1554 return true; 1555 return (CmpValue == 0 || CmpValue == 1) && 1556 removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI); 1557 } 1558 1559 /// Get opcode of S version of Instr. 1560 /// If Instr is S version its opcode is returned. 1561 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version 1562 /// or we are not interested in it. 
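/// For example sForm(ADDWri) is ADDSWri, while an opcode with no flag-setting
/// twin (or one this optimization does not care about) maps to
/// AArch64::INSTRUCTION_LIST_END.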
1563 static unsigned sForm(MachineInstr &Instr) { 1564 switch (Instr.getOpcode()) { 1565 default: 1566 return AArch64::INSTRUCTION_LIST_END; 1567 1568 case AArch64::ADDSWrr: 1569 case AArch64::ADDSWri: 1570 case AArch64::ADDSXrr: 1571 case AArch64::ADDSXri: 1572 case AArch64::SUBSWrr: 1573 case AArch64::SUBSWri: 1574 case AArch64::SUBSXrr: 1575 case AArch64::SUBSXri: 1576 return Instr.getOpcode(); 1577 1578 case AArch64::ADDWrr: 1579 return AArch64::ADDSWrr; 1580 case AArch64::ADDWri: 1581 return AArch64::ADDSWri; 1582 case AArch64::ADDXrr: 1583 return AArch64::ADDSXrr; 1584 case AArch64::ADDXri: 1585 return AArch64::ADDSXri; 1586 case AArch64::ADCWr: 1587 return AArch64::ADCSWr; 1588 case AArch64::ADCXr: 1589 return AArch64::ADCSXr; 1590 case AArch64::SUBWrr: 1591 return AArch64::SUBSWrr; 1592 case AArch64::SUBWri: 1593 return AArch64::SUBSWri; 1594 case AArch64::SUBXrr: 1595 return AArch64::SUBSXrr; 1596 case AArch64::SUBXri: 1597 return AArch64::SUBSXri; 1598 case AArch64::SBCWr: 1599 return AArch64::SBCSWr; 1600 case AArch64::SBCXr: 1601 return AArch64::SBCSXr; 1602 case AArch64::ANDWri: 1603 return AArch64::ANDSWri; 1604 case AArch64::ANDXri: 1605 return AArch64::ANDSXri; 1606 } 1607 } 1608 1609 /// Check if AArch64::NZCV should be alive in successors of MBB. 1610 static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) { 1611 for (auto *BB : MBB->successors()) 1612 if (BB->isLiveIn(AArch64::NZCV)) 1613 return true; 1614 return false; 1615 } 1616 1617 /// \returns The condition code operand index for \p Instr if it is a branch 1618 /// or select and -1 otherwise. 1619 static int 1620 findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) { 1621 switch (Instr.getOpcode()) { 1622 default: 1623 return -1; 1624 1625 case AArch64::Bcc: { 1626 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); 1627 assert(Idx >= 2); 1628 return Idx - 2; 1629 } 1630 1631 case AArch64::CSINVWr: 1632 case AArch64::CSINVXr: 1633 case AArch64::CSINCWr: 1634 case AArch64::CSINCXr: 1635 case AArch64::CSELWr: 1636 case AArch64::CSELXr: 1637 case AArch64::CSNEGWr: 1638 case AArch64::CSNEGXr: 1639 case AArch64::FCSELSrrr: 1640 case AArch64::FCSELDrrr: { 1641 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); 1642 assert(Idx >= 1); 1643 return Idx - 1; 1644 } 1645 } 1646 } 1647 1648 /// Find a condition code used by the instruction. 1649 /// Returns AArch64CC::Invalid if either the instruction does not use condition 1650 /// codes or we don't optimize CmpInstr in the presence of such instructions. 1651 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) { 1652 int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr); 1653 return CCIdx >= 0 ? 
static_cast<AArch64CC::CondCode>( 1654 Instr.getOperand(CCIdx).getImm()) 1655 : AArch64CC::Invalid; 1656 } 1657 1658 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) { 1659 assert(CC != AArch64CC::Invalid); 1660 UsedNZCV UsedFlags; 1661 switch (CC) { 1662 default: 1663 break; 1664 1665 case AArch64CC::EQ: // Z set 1666 case AArch64CC::NE: // Z clear 1667 UsedFlags.Z = true; 1668 break; 1669 1670 case AArch64CC::HI: // Z clear and C set 1671 case AArch64CC::LS: // Z set or C clear 1672 UsedFlags.Z = true; 1673 [[fallthrough]]; 1674 case AArch64CC::HS: // C set 1675 case AArch64CC::LO: // C clear 1676 UsedFlags.C = true; 1677 break; 1678 1679 case AArch64CC::MI: // N set 1680 case AArch64CC::PL: // N clear 1681 UsedFlags.N = true; 1682 break; 1683 1684 case AArch64CC::VS: // V set 1685 case AArch64CC::VC: // V clear 1686 UsedFlags.V = true; 1687 break; 1688 1689 case AArch64CC::GT: // Z clear, N and V the same 1690 case AArch64CC::LE: // Z set, N and V differ 1691 UsedFlags.Z = true; 1692 [[fallthrough]]; 1693 case AArch64CC::GE: // N and V the same 1694 case AArch64CC::LT: // N and V differ 1695 UsedFlags.N = true; 1696 UsedFlags.V = true; 1697 break; 1698 } 1699 return UsedFlags; 1700 } 1701 1702 /// \returns Conditions flags used after \p CmpInstr in its MachineBB if NZCV 1703 /// flags are not alive in successors of the same \p CmpInstr and \p MI parent. 1704 /// \returns std::nullopt otherwise. 1705 /// 1706 /// Collect instructions using that flags in \p CCUseInstrs if provided. 1707 std::optional<UsedNZCV> 1708 llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, 1709 const TargetRegisterInfo &TRI, 1710 SmallVectorImpl<MachineInstr *> *CCUseInstrs) { 1711 MachineBasicBlock *CmpParent = CmpInstr.getParent(); 1712 if (MI.getParent() != CmpParent) 1713 return std::nullopt; 1714 1715 if (areCFlagsAliveInSuccessors(CmpParent)) 1716 return std::nullopt; 1717 1718 UsedNZCV NZCVUsedAfterCmp; 1719 for (MachineInstr &Instr : instructionsWithoutDebug( 1720 std::next(CmpInstr.getIterator()), CmpParent->instr_end())) { 1721 if (Instr.readsRegister(AArch64::NZCV, &TRI)) { 1722 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr); 1723 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction 1724 return std::nullopt; 1725 NZCVUsedAfterCmp |= getUsedNZCV(CC); 1726 if (CCUseInstrs) 1727 CCUseInstrs->push_back(&Instr); 1728 } 1729 if (Instr.modifiesRegister(AArch64::NZCV, &TRI)) 1730 break; 1731 } 1732 return NZCVUsedAfterCmp; 1733 } 1734 1735 static bool isADDSRegImm(unsigned Opcode) { 1736 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri; 1737 } 1738 1739 static bool isSUBSRegImm(unsigned Opcode) { 1740 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri; 1741 } 1742 1743 /// Check if CmpInstr can be substituted by MI. 1744 /// 1745 /// CmpInstr can be substituted: 1746 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0' 1747 /// - and, MI and CmpInstr are from the same MachineBB 1748 /// - and, condition flags are not alive in successors of the CmpInstr parent 1749 /// - and, if MI opcode is the S form there must be no defs of flags between 1750 /// MI and CmpInstr 1751 /// or if MI opcode is not the S form there must be neither defs of flags 1752 /// nor uses of flags between MI and CmpInstr. 1753 /// - and, if C/V flags are not used after CmpInstr 1754 /// or if N flag is used but MI produces poison value if signed overflow 1755 /// occurs. 
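///
/// For illustration, in a sequence like
///   add  w8, w9, #1
///   subs wzr, w8, #0    ; cmp w8, #0
///   b.eq ...
/// the cmp can be removed once the add is rewritten to "adds w8, w9, #1",
/// since only the Z flag is consumed afterwards.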
1756 static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, 1757 const TargetRegisterInfo &TRI) { 1758 // NOTE this assertion guarantees that MI.getOpcode() is add or subtraction 1759 // that may or may not set flags. 1760 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END); 1761 1762 const unsigned CmpOpcode = CmpInstr.getOpcode(); 1763 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode)) 1764 return false; 1765 1766 assert((CmpInstr.getOperand(2).isImm() && 1767 CmpInstr.getOperand(2).getImm() == 0) && 1768 "Caller guarantees that CmpInstr compares with constant 0"); 1769 1770 std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI); 1771 if (!NZVCUsed || NZVCUsed->C) 1772 return false; 1773 1774 // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either 1775 // '%vreg = add ...' or '%vreg = sub ...'. 1776 // Condition flag V is used to indicate signed overflow. 1777 // 1) MI and CmpInstr set N and V to the same value. 1778 // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when 1779 // signed overflow occurs, so CmpInstr could still be simplified away. 1780 if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap)) 1781 return false; 1782 1783 AccessKind AccessToCheck = AK_Write; 1784 if (sForm(MI) != MI.getOpcode()) 1785 AccessToCheck = AK_All; 1786 return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck); 1787 } 1788 1789 /// Substitute an instruction comparing to zero with another instruction 1790 /// which produces needed condition flags. 1791 /// 1792 /// Return true on success. 1793 bool AArch64InstrInfo::substituteCmpToZero( 1794 MachineInstr &CmpInstr, unsigned SrcReg, 1795 const MachineRegisterInfo &MRI) const { 1796 // Get the unique definition of SrcReg. 1797 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg); 1798 if (!MI) 1799 return false; 1800 1801 const TargetRegisterInfo &TRI = getRegisterInfo(); 1802 1803 unsigned NewOpc = sForm(*MI); 1804 if (NewOpc == AArch64::INSTRUCTION_LIST_END) 1805 return false; 1806 1807 if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI)) 1808 return false; 1809 1810 // Update the instruction to set NZCV. 1811 MI->setDesc(get(NewOpc)); 1812 CmpInstr.eraseFromParent(); 1813 bool succeeded = UpdateOperandRegClass(*MI); 1814 (void)succeeded; 1815 assert(succeeded && "Some operands reg class are incompatible!"); 1816 MI->addRegisterDefined(AArch64::NZCV, &TRI); 1817 return true; 1818 } 1819 1820 /// \returns True if \p CmpInstr can be removed. 1821 /// 1822 /// \p IsInvertCC is true if, after removing \p CmpInstr, condition 1823 /// codes used in \p CCUseInstrs must be inverted. 
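///
/// For example (schematic; this mirrors the compare-with-1 case handled
/// below):
/// \code
///   csinc w8, wzr, wzr, eq
///   cmp   w8, #1
///   b.eq  <target>
/// \endcode
/// Since csinc w8, wzr, wzr, eq materializes "eq ? 0 : 1", removing the cmp
/// means the branch now reads the original flags and must become b.ne, so
/// \p IsInvertCC is set to true.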
1824 static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr, 1825 int CmpValue, const TargetRegisterInfo &TRI, 1826 SmallVectorImpl<MachineInstr *> &CCUseInstrs, 1827 bool &IsInvertCC) { 1828 assert((CmpValue == 0 || CmpValue == 1) && 1829 "Only comparisons to 0 or 1 considered for removal!"); 1830 1831 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>' 1832 unsigned MIOpc = MI.getOpcode(); 1833 if (MIOpc == AArch64::CSINCWr) { 1834 if (MI.getOperand(1).getReg() != AArch64::WZR || 1835 MI.getOperand(2).getReg() != AArch64::WZR) 1836 return false; 1837 } else if (MIOpc == AArch64::CSINCXr) { 1838 if (MI.getOperand(1).getReg() != AArch64::XZR || 1839 MI.getOperand(2).getReg() != AArch64::XZR) 1840 return false; 1841 } else { 1842 return false; 1843 } 1844 AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI); 1845 if (MICC == AArch64CC::Invalid) 1846 return false; 1847 1848 // NZCV needs to be defined 1849 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) 1850 return false; 1851 1852 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1' 1853 const unsigned CmpOpcode = CmpInstr.getOpcode(); 1854 bool IsSubsRegImm = isSUBSRegImm(CmpOpcode); 1855 if (CmpValue && !IsSubsRegImm) 1856 return false; 1857 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode)) 1858 return false; 1859 1860 // MI conditions allowed: eq, ne, mi, pl 1861 UsedNZCV MIUsedNZCV = getUsedNZCV(MICC); 1862 if (MIUsedNZCV.C || MIUsedNZCV.V) 1863 return false; 1864 1865 std::optional<UsedNZCV> NZCVUsedAfterCmp = 1866 examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs); 1867 // Condition flags are not used in CmpInstr basic block successors and only 1868 // Z or N flags allowed to be used after CmpInstr within its basic block 1869 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V) 1870 return false; 1871 // Z or N flag used after CmpInstr must correspond to the flag used in MI 1872 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) || 1873 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z)) 1874 return false; 1875 // If CmpInstr is comparison to zero MI conditions are limited to eq, ne 1876 if (MIUsedNZCV.N && !CmpValue) 1877 return false; 1878 1879 // There must be no defs of flags between MI and CmpInstr 1880 if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write)) 1881 return false; 1882 1883 // Condition code is inverted in the following cases: 1884 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' 1885 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1' 1886 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) || 1887 (!CmpValue && MICC == AArch64CC::NE); 1888 return true; 1889 } 1890 1891 /// Remove comparison in csinc-cmp sequence 1892 /// 1893 /// Examples: 1894 /// 1. \code 1895 /// csinc w9, wzr, wzr, ne 1896 /// cmp w9, #0 1897 /// b.eq 1898 /// \endcode 1899 /// to 1900 /// \code 1901 /// csinc w9, wzr, wzr, ne 1902 /// b.ne 1903 /// \endcode 1904 /// 1905 /// 2. 
\code 1906 /// csinc x2, xzr, xzr, mi 1907 /// cmp x2, #1 1908 /// b.pl 1909 /// \endcode 1910 /// to 1911 /// \code 1912 /// csinc x2, xzr, xzr, mi 1913 /// b.pl 1914 /// \endcode 1915 /// 1916 /// \param CmpInstr comparison instruction 1917 /// \return True when comparison removed 1918 bool AArch64InstrInfo::removeCmpToZeroOrOne( 1919 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue, 1920 const MachineRegisterInfo &MRI) const { 1921 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg); 1922 if (!MI) 1923 return false; 1924 const TargetRegisterInfo &TRI = getRegisterInfo(); 1925 SmallVector<MachineInstr *, 4> CCUseInstrs; 1926 bool IsInvertCC = false; 1927 if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs, 1928 IsInvertCC)) 1929 return false; 1930 // Make transformation 1931 CmpInstr.eraseFromParent(); 1932 if (IsInvertCC) { 1933 // Invert condition codes in CmpInstr CC users 1934 for (MachineInstr *CCUseInstr : CCUseInstrs) { 1935 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr); 1936 assert(Idx >= 0 && "Unexpected instruction using CC."); 1937 MachineOperand &CCOperand = CCUseInstr->getOperand(Idx); 1938 AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode( 1939 static_cast<AArch64CC::CondCode>(CCOperand.getImm())); 1940 CCOperand.setImm(CCUse); 1941 } 1942 } 1943 return true; 1944 } 1945 1946 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { 1947 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD && 1948 MI.getOpcode() != AArch64::CATCHRET) 1949 return false; 1950 1951 MachineBasicBlock &MBB = *MI.getParent(); 1952 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>(); 1953 auto TRI = Subtarget.getRegisterInfo(); 1954 DebugLoc DL = MI.getDebugLoc(); 1955 1956 if (MI.getOpcode() == AArch64::CATCHRET) { 1957 // Skip to the first instruction before the epilog. 
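    // The catchret target address is materialized with an ADRP + ADDXri pair
    // below. It is inserted before the contiguous run of FrameDestroy-flagged
    // instructions that ends this block, keeping the SEH epilogue contiguous;
    // the loop below walks backwards over that run to find the insertion
    // point.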
    const TargetInstrInfo *TII =
        MBB.getParent()->getSubtarget().getInstrInfo();
    MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
    auto MBBI = MachineBasicBlock::iterator(MI);
    MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
    while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
           FirstEpilogSEH != MBB.begin())
      FirstEpilogSEH = std::prev(FirstEpilogSEH);
    if (FirstEpilogSEH != MBB.begin())
      FirstEpilogSEH = std::next(FirstEpilogSEH);
    BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
        .addReg(AArch64::X0, RegState::Define)
        .addMBB(TargetMBB);
    BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
        .addReg(AArch64::X0, RegState::Define)
        .addReg(AArch64::X0)
        .addMBB(TargetMBB)
        .addImm(0);
    return true;
  }

  Register Reg = MI.getOperand(0).getReg();
  Module &M = *MBB.getParent()->getFunction().getParent();
  if (M.getStackProtectorGuard() == "sysreg") {
    const AArch64SysReg::SysReg *SrcReg =
        AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
    if (!SrcReg)
      report_fatal_error("Unknown SysReg for Stack Protector Guard Register");

    // mrs xN, sysreg
    BuildMI(MBB, MI, DL, get(AArch64::MRS))
        .addDef(Reg, RegState::Renamable)
        .addImm(SrcReg->Encoding);
    int Offset = M.getStackProtectorGuardOffset();
    if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
      // ldr xN, [xN, #offset]
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(Offset / 8);
    } else if (Offset >= -256 && Offset <= 255) {
      // ldur xN, [xN, #offset]
      BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(Offset);
    } else if (Offset >= -4095 && Offset <= 4095) {
      if (Offset > 0) {
        // add xN, xN, #offset
        BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
            .addDef(Reg)
            .addUse(Reg, RegState::Kill)
            .addImm(Offset)
            .addImm(0);
      } else {
        // sub xN, xN, #offset
        BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
            .addDef(Reg)
            .addUse(Reg, RegState::Kill)
            .addImm(-Offset)
            .addImm(0);
      }
      // ldr xN, [xN]
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(0);
    } else {
      // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
      // than 32760.
      // It might be nice to use AArch64::MOVi32imm here, which would get
      // expanded in PreSched2 after PostRA, but our lone scratch Reg already
      // contains the MRS result. findScratchNonCalleeSaveRegister() in
      // AArch64FrameLowering might help us find such a scratch register
      // though. If we failed to find a scratch register, we could emit a
      // stream of add instructions to build up the immediate. Or, we could try
      // to insert an AArch64::MOVi32imm before register allocation so that we
      // didn't need to scavenge for a scratch register.
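      // For a concrete (hypothetical) example: a guard offset of 40000 is a
      // multiple of 8 but is larger than 32760, and it is also outside the
      // +/- 4095 and +/- 256 ranges handled above, so it currently ends up
      // here.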
2036 report_fatal_error("Unable to encode Stack Protector Guard Offset"); 2037 } 2038 MBB.erase(MI); 2039 return true; 2040 } 2041 2042 const GlobalValue *GV = 2043 cast<GlobalValue>((*MI.memoperands_begin())->getValue()); 2044 const TargetMachine &TM = MBB.getParent()->getTarget(); 2045 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); 2046 const unsigned char MO_NC = AArch64II::MO_NC; 2047 2048 if ((OpFlags & AArch64II::MO_GOT) != 0) { 2049 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg) 2050 .addGlobalAddress(GV, 0, OpFlags); 2051 if (Subtarget.isTargetILP32()) { 2052 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 2053 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 2054 .addDef(Reg32, RegState::Dead) 2055 .addUse(Reg, RegState::Kill) 2056 .addImm(0) 2057 .addMemOperand(*MI.memoperands_begin()) 2058 .addDef(Reg, RegState::Implicit); 2059 } else { 2060 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 2061 .addReg(Reg, RegState::Kill) 2062 .addImm(0) 2063 .addMemOperand(*MI.memoperands_begin()); 2064 } 2065 } else if (TM.getCodeModel() == CodeModel::Large) { 2066 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?"); 2067 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) 2068 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC) 2069 .addImm(0); 2070 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2071 .addReg(Reg, RegState::Kill) 2072 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC) 2073 .addImm(16); 2074 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2075 .addReg(Reg, RegState::Kill) 2076 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC) 2077 .addImm(32); 2078 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2079 .addReg(Reg, RegState::Kill) 2080 .addGlobalAddress(GV, 0, AArch64II::MO_G3) 2081 .addImm(48); 2082 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 2083 .addReg(Reg, RegState::Kill) 2084 .addImm(0) 2085 .addMemOperand(*MI.memoperands_begin()); 2086 } else if (TM.getCodeModel() == CodeModel::Tiny) { 2087 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg) 2088 .addGlobalAddress(GV, 0, OpFlags); 2089 } else { 2090 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg) 2091 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); 2092 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC; 2093 if (Subtarget.isTargetILP32()) { 2094 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 2095 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 2096 .addDef(Reg32, RegState::Dead) 2097 .addUse(Reg, RegState::Kill) 2098 .addGlobalAddress(GV, 0, LoFlags) 2099 .addMemOperand(*MI.memoperands_begin()) 2100 .addDef(Reg, RegState::Implicit); 2101 } else { 2102 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 2103 .addReg(Reg, RegState::Kill) 2104 .addGlobalAddress(GV, 0, LoFlags) 2105 .addMemOperand(*MI.memoperands_begin()); 2106 } 2107 } 2108 2109 MBB.erase(MI); 2110 2111 return true; 2112 } 2113 2114 // Return true if this instruction simply sets its single destination register 2115 // to zero. This is equivalent to a register rename of the zero-register. 
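//
// For example (schematic MIR), each of the following is recognized:
//   $w0 = MOVZWi 0, 0
//   $x1 = ANDXri $xzr, <logical imm>
//   $w2 = COPY $wzr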
bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
    if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
      assert(MI.getDesc().getNumOperands() == 3 &&
             MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
      return true;
    }
    break;
  case AArch64::ANDWri: // and Rd, Rzr, #imm
    return MI.getOperand(1).getReg() == AArch64::WZR;
  case AArch64::ANDXri:
    return MI.getOperand(1).getReg() == AArch64::XZR;
  case TargetOpcode::COPY:
    return MI.getOperand(1).getReg() == AArch64::WZR;
  }
  return false;
}

// Return true if this instruction simply renames a general register without
// modifying bits.
bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    break;
  case TargetOpcode::COPY: {
    // GPR32 copies will be lowered to ORRXrs
    Register DstReg = MI.getOperand(0).getReg();
    return (AArch64::GPR32RegClass.contains(DstReg) ||
            AArch64::GPR64RegClass.contains(DstReg));
  }
  case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
    if (MI.getOperand(1).getReg() == AArch64::XZR) {
      assert(MI.getDesc().getNumOperands() == 4 &&
             MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
      return true;
    }
    break;
  case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
    if (MI.getOperand(2).getImm() == 0) {
      assert(MI.getDesc().getNumOperands() == 4 &&
             MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
      return true;
    }
    break;
  }
  return false;
}

// Return true if this instruction simply renames an FP register without
// modifying bits.
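//
// For example (schematic MIR), both of the following qualify:
//   $q0 = COPY $q1
//   $q2 = ORRv16i8 $q3, $q3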
2170 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) { 2171 switch (MI.getOpcode()) { 2172 default: 2173 break; 2174 case TargetOpcode::COPY: { 2175 Register DstReg = MI.getOperand(0).getReg(); 2176 return AArch64::FPR128RegClass.contains(DstReg); 2177 } 2178 case AArch64::ORRv16i8: 2179 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) { 2180 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() && 2181 "invalid ORRv16i8 operands"); 2182 return true; 2183 } 2184 break; 2185 } 2186 return false; 2187 } 2188 2189 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, 2190 int &FrameIndex) const { 2191 switch (MI.getOpcode()) { 2192 default: 2193 break; 2194 case AArch64::LDRWui: 2195 case AArch64::LDRXui: 2196 case AArch64::LDRBui: 2197 case AArch64::LDRHui: 2198 case AArch64::LDRSui: 2199 case AArch64::LDRDui: 2200 case AArch64::LDRQui: 2201 case AArch64::LDR_PXI: 2202 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && 2203 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { 2204 FrameIndex = MI.getOperand(1).getIndex(); 2205 return MI.getOperand(0).getReg(); 2206 } 2207 break; 2208 } 2209 2210 return 0; 2211 } 2212 2213 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI, 2214 int &FrameIndex) const { 2215 switch (MI.getOpcode()) { 2216 default: 2217 break; 2218 case AArch64::STRWui: 2219 case AArch64::STRXui: 2220 case AArch64::STRBui: 2221 case AArch64::STRHui: 2222 case AArch64::STRSui: 2223 case AArch64::STRDui: 2224 case AArch64::STRQui: 2225 case AArch64::STR_PXI: 2226 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && 2227 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { 2228 FrameIndex = MI.getOperand(1).getIndex(); 2229 return MI.getOperand(0).getReg(); 2230 } 2231 break; 2232 } 2233 return 0; 2234 } 2235 2236 /// Check all MachineMemOperands for a hint to suppress pairing. 2237 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) { 2238 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 2239 return MMO->getFlags() & MOSuppressPair; 2240 }); 2241 } 2242 2243 /// Set a flag on the first MachineMemOperand to suppress pairing. 2244 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) { 2245 if (MI.memoperands_empty()) 2246 return; 2247 (*MI.memoperands_begin())->setFlags(MOSuppressPair); 2248 } 2249 2250 /// Check all MachineMemOperands for a hint that the load/store is strided. 
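///
/// A sketch of how such a hint could be attached (mirroring suppressLdStPair
/// above); in-tree the flag is normally set by a separate AArch64-specific
/// pass:
/// \code
///   if (!MI.memoperands_empty())
///     (*MI.memoperands_begin())->setFlags(AArch64InstrInfo::MOStridedAccess);
/// \endcode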
2251 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) { 2252 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 2253 return MMO->getFlags() & MOStridedAccess; 2254 }); 2255 } 2256 2257 bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) { 2258 switch (Opc) { 2259 default: 2260 return false; 2261 case AArch64::STURSi: 2262 case AArch64::STRSpre: 2263 case AArch64::STURDi: 2264 case AArch64::STRDpre: 2265 case AArch64::STURQi: 2266 case AArch64::STRQpre: 2267 case AArch64::STURBBi: 2268 case AArch64::STURHHi: 2269 case AArch64::STURWi: 2270 case AArch64::STRWpre: 2271 case AArch64::STURXi: 2272 case AArch64::STRXpre: 2273 case AArch64::LDURSi: 2274 case AArch64::LDRSpre: 2275 case AArch64::LDURDi: 2276 case AArch64::LDRDpre: 2277 case AArch64::LDURQi: 2278 case AArch64::LDRQpre: 2279 case AArch64::LDURWi: 2280 case AArch64::LDRWpre: 2281 case AArch64::LDURXi: 2282 case AArch64::LDRXpre: 2283 case AArch64::LDRSWpre: 2284 case AArch64::LDURSWi: 2285 case AArch64::LDURHHi: 2286 case AArch64::LDURBBi: 2287 case AArch64::LDURSBWi: 2288 case AArch64::LDURSHWi: 2289 return true; 2290 } 2291 } 2292 2293 std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) { 2294 switch (Opc) { 2295 default: return {}; 2296 case AArch64::PRFMui: return AArch64::PRFUMi; 2297 case AArch64::LDRXui: return AArch64::LDURXi; 2298 case AArch64::LDRWui: return AArch64::LDURWi; 2299 case AArch64::LDRBui: return AArch64::LDURBi; 2300 case AArch64::LDRHui: return AArch64::LDURHi; 2301 case AArch64::LDRSui: return AArch64::LDURSi; 2302 case AArch64::LDRDui: return AArch64::LDURDi; 2303 case AArch64::LDRQui: return AArch64::LDURQi; 2304 case AArch64::LDRBBui: return AArch64::LDURBBi; 2305 case AArch64::LDRHHui: return AArch64::LDURHHi; 2306 case AArch64::LDRSBXui: return AArch64::LDURSBXi; 2307 case AArch64::LDRSBWui: return AArch64::LDURSBWi; 2308 case AArch64::LDRSHXui: return AArch64::LDURSHXi; 2309 case AArch64::LDRSHWui: return AArch64::LDURSHWi; 2310 case AArch64::LDRSWui: return AArch64::LDURSWi; 2311 case AArch64::STRXui: return AArch64::STURXi; 2312 case AArch64::STRWui: return AArch64::STURWi; 2313 case AArch64::STRBui: return AArch64::STURBi; 2314 case AArch64::STRHui: return AArch64::STURHi; 2315 case AArch64::STRSui: return AArch64::STURSi; 2316 case AArch64::STRDui: return AArch64::STURDi; 2317 case AArch64::STRQui: return AArch64::STURQi; 2318 case AArch64::STRBBui: return AArch64::STURBBi; 2319 case AArch64::STRHHui: return AArch64::STURHHi; 2320 } 2321 } 2322 2323 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { 2324 switch (Opc) { 2325 default: 2326 return 2; 2327 case AArch64::LDPXi: 2328 case AArch64::LDPDi: 2329 case AArch64::STPXi: 2330 case AArch64::STPDi: 2331 case AArch64::LDNPXi: 2332 case AArch64::LDNPDi: 2333 case AArch64::STNPXi: 2334 case AArch64::STNPDi: 2335 case AArch64::LDPQi: 2336 case AArch64::STPQi: 2337 case AArch64::LDNPQi: 2338 case AArch64::STNPQi: 2339 case AArch64::LDPWi: 2340 case AArch64::LDPSi: 2341 case AArch64::STPWi: 2342 case AArch64::STPSi: 2343 case AArch64::LDNPWi: 2344 case AArch64::LDNPSi: 2345 case AArch64::STNPWi: 2346 case AArch64::STNPSi: 2347 case AArch64::LDG: 2348 case AArch64::STGPi: 2349 2350 case AArch64::LD1B_IMM: 2351 case AArch64::LD1B_H_IMM: 2352 case AArch64::LD1B_S_IMM: 2353 case AArch64::LD1B_D_IMM: 2354 case AArch64::LD1SB_H_IMM: 2355 case AArch64::LD1SB_S_IMM: 2356 case AArch64::LD1SB_D_IMM: 2357 case AArch64::LD1H_IMM: 2358 case AArch64::LD1H_S_IMM: 2359 case AArch64::LD1H_D_IMM: 2360 
case AArch64::LD1SH_S_IMM: 2361 case AArch64::LD1SH_D_IMM: 2362 case AArch64::LD1W_IMM: 2363 case AArch64::LD1W_D_IMM: 2364 case AArch64::LD1SW_D_IMM: 2365 case AArch64::LD1D_IMM: 2366 2367 case AArch64::LD2B_IMM: 2368 case AArch64::LD2H_IMM: 2369 case AArch64::LD2W_IMM: 2370 case AArch64::LD2D_IMM: 2371 case AArch64::LD3B_IMM: 2372 case AArch64::LD3H_IMM: 2373 case AArch64::LD3W_IMM: 2374 case AArch64::LD3D_IMM: 2375 case AArch64::LD4B_IMM: 2376 case AArch64::LD4H_IMM: 2377 case AArch64::LD4W_IMM: 2378 case AArch64::LD4D_IMM: 2379 2380 case AArch64::ST1B_IMM: 2381 case AArch64::ST1B_H_IMM: 2382 case AArch64::ST1B_S_IMM: 2383 case AArch64::ST1B_D_IMM: 2384 case AArch64::ST1H_IMM: 2385 case AArch64::ST1H_S_IMM: 2386 case AArch64::ST1H_D_IMM: 2387 case AArch64::ST1W_IMM: 2388 case AArch64::ST1W_D_IMM: 2389 case AArch64::ST1D_IMM: 2390 2391 case AArch64::ST2B_IMM: 2392 case AArch64::ST2H_IMM: 2393 case AArch64::ST2W_IMM: 2394 case AArch64::ST2D_IMM: 2395 case AArch64::ST3B_IMM: 2396 case AArch64::ST3H_IMM: 2397 case AArch64::ST3W_IMM: 2398 case AArch64::ST3D_IMM: 2399 case AArch64::ST4B_IMM: 2400 case AArch64::ST4H_IMM: 2401 case AArch64::ST4W_IMM: 2402 case AArch64::ST4D_IMM: 2403 2404 case AArch64::LD1RB_IMM: 2405 case AArch64::LD1RB_H_IMM: 2406 case AArch64::LD1RB_S_IMM: 2407 case AArch64::LD1RB_D_IMM: 2408 case AArch64::LD1RSB_H_IMM: 2409 case AArch64::LD1RSB_S_IMM: 2410 case AArch64::LD1RSB_D_IMM: 2411 case AArch64::LD1RH_IMM: 2412 case AArch64::LD1RH_S_IMM: 2413 case AArch64::LD1RH_D_IMM: 2414 case AArch64::LD1RSH_S_IMM: 2415 case AArch64::LD1RSH_D_IMM: 2416 case AArch64::LD1RW_IMM: 2417 case AArch64::LD1RW_D_IMM: 2418 case AArch64::LD1RSW_IMM: 2419 case AArch64::LD1RD_IMM: 2420 2421 case AArch64::LDNT1B_ZRI: 2422 case AArch64::LDNT1H_ZRI: 2423 case AArch64::LDNT1W_ZRI: 2424 case AArch64::LDNT1D_ZRI: 2425 case AArch64::STNT1B_ZRI: 2426 case AArch64::STNT1H_ZRI: 2427 case AArch64::STNT1W_ZRI: 2428 case AArch64::STNT1D_ZRI: 2429 2430 case AArch64::LDNF1B_IMM: 2431 case AArch64::LDNF1B_H_IMM: 2432 case AArch64::LDNF1B_S_IMM: 2433 case AArch64::LDNF1B_D_IMM: 2434 case AArch64::LDNF1SB_H_IMM: 2435 case AArch64::LDNF1SB_S_IMM: 2436 case AArch64::LDNF1SB_D_IMM: 2437 case AArch64::LDNF1H_IMM: 2438 case AArch64::LDNF1H_S_IMM: 2439 case AArch64::LDNF1H_D_IMM: 2440 case AArch64::LDNF1SH_S_IMM: 2441 case AArch64::LDNF1SH_D_IMM: 2442 case AArch64::LDNF1W_IMM: 2443 case AArch64::LDNF1W_D_IMM: 2444 case AArch64::LDNF1SW_D_IMM: 2445 case AArch64::LDNF1D_IMM: 2446 return 3; 2447 case AArch64::ADDG: 2448 case AArch64::STGi: 2449 case AArch64::LDR_PXI: 2450 case AArch64::STR_PXI: 2451 return 2; 2452 } 2453 } 2454 2455 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) { 2456 switch (MI.getOpcode()) { 2457 default: 2458 return false; 2459 // Scaled instructions. 2460 case AArch64::STRSui: 2461 case AArch64::STRDui: 2462 case AArch64::STRQui: 2463 case AArch64::STRXui: 2464 case AArch64::STRWui: 2465 case AArch64::LDRSui: 2466 case AArch64::LDRDui: 2467 case AArch64::LDRQui: 2468 case AArch64::LDRXui: 2469 case AArch64::LDRWui: 2470 case AArch64::LDRSWui: 2471 // Unscaled instructions. 
2472 case AArch64::STURSi: 2473 case AArch64::STRSpre: 2474 case AArch64::STURDi: 2475 case AArch64::STRDpre: 2476 case AArch64::STURQi: 2477 case AArch64::STRQpre: 2478 case AArch64::STURWi: 2479 case AArch64::STRWpre: 2480 case AArch64::STURXi: 2481 case AArch64::STRXpre: 2482 case AArch64::LDURSi: 2483 case AArch64::LDRSpre: 2484 case AArch64::LDURDi: 2485 case AArch64::LDRDpre: 2486 case AArch64::LDURQi: 2487 case AArch64::LDRQpre: 2488 case AArch64::LDURWi: 2489 case AArch64::LDRWpre: 2490 case AArch64::LDURXi: 2491 case AArch64::LDRXpre: 2492 case AArch64::LDURSWi: 2493 case AArch64::LDRSWpre: 2494 return true; 2495 } 2496 } 2497 2498 bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) { 2499 switch (MI.getOpcode()) { 2500 default: 2501 assert((!MI.isCall() || !MI.isReturn()) && 2502 "Unexpected instruction - was a new tail call opcode introduced?"); 2503 return false; 2504 case AArch64::TCRETURNdi: 2505 case AArch64::TCRETURNri: 2506 case AArch64::TCRETURNriBTI: 2507 case AArch64::TCRETURNriALL: 2508 return true; 2509 } 2510 } 2511 2512 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) { 2513 switch (Opc) { 2514 default: 2515 llvm_unreachable("Opcode has no flag setting equivalent!"); 2516 // 32-bit cases: 2517 case AArch64::ADDWri: 2518 return AArch64::ADDSWri; 2519 case AArch64::ADDWrr: 2520 return AArch64::ADDSWrr; 2521 case AArch64::ADDWrs: 2522 return AArch64::ADDSWrs; 2523 case AArch64::ADDWrx: 2524 return AArch64::ADDSWrx; 2525 case AArch64::ANDWri: 2526 return AArch64::ANDSWri; 2527 case AArch64::ANDWrr: 2528 return AArch64::ANDSWrr; 2529 case AArch64::ANDWrs: 2530 return AArch64::ANDSWrs; 2531 case AArch64::BICWrr: 2532 return AArch64::BICSWrr; 2533 case AArch64::BICWrs: 2534 return AArch64::BICSWrs; 2535 case AArch64::SUBWri: 2536 return AArch64::SUBSWri; 2537 case AArch64::SUBWrr: 2538 return AArch64::SUBSWrr; 2539 case AArch64::SUBWrs: 2540 return AArch64::SUBSWrs; 2541 case AArch64::SUBWrx: 2542 return AArch64::SUBSWrx; 2543 // 64-bit cases: 2544 case AArch64::ADDXri: 2545 return AArch64::ADDSXri; 2546 case AArch64::ADDXrr: 2547 return AArch64::ADDSXrr; 2548 case AArch64::ADDXrs: 2549 return AArch64::ADDSXrs; 2550 case AArch64::ADDXrx: 2551 return AArch64::ADDSXrx; 2552 case AArch64::ANDXri: 2553 return AArch64::ANDSXri; 2554 case AArch64::ANDXrr: 2555 return AArch64::ANDSXrr; 2556 case AArch64::ANDXrs: 2557 return AArch64::ANDSXrs; 2558 case AArch64::BICXrr: 2559 return AArch64::BICSXrr; 2560 case AArch64::BICXrs: 2561 return AArch64::BICSXrs; 2562 case AArch64::SUBXri: 2563 return AArch64::SUBSXri; 2564 case AArch64::SUBXrr: 2565 return AArch64::SUBSXrr; 2566 case AArch64::SUBXrs: 2567 return AArch64::SUBSXrs; 2568 case AArch64::SUBXrx: 2569 return AArch64::SUBSXrx; 2570 // SVE instructions: 2571 case AArch64::AND_PPzPP: 2572 return AArch64::ANDS_PPzPP; 2573 case AArch64::BIC_PPzPP: 2574 return AArch64::BICS_PPzPP; 2575 case AArch64::EOR_PPzPP: 2576 return AArch64::EORS_PPzPP; 2577 case AArch64::NAND_PPzPP: 2578 return AArch64::NANDS_PPzPP; 2579 case AArch64::NOR_PPzPP: 2580 return AArch64::NORS_PPzPP; 2581 case AArch64::ORN_PPzPP: 2582 return AArch64::ORNS_PPzPP; 2583 case AArch64::ORR_PPzPP: 2584 return AArch64::ORRS_PPzPP; 2585 case AArch64::BRKA_PPzP: 2586 return AArch64::BRKAS_PPzP; 2587 case AArch64::BRKPA_PPzPP: 2588 return AArch64::BRKPAS_PPzPP; 2589 case AArch64::BRKB_PPzP: 2590 return AArch64::BRKBS_PPzP; 2591 case AArch64::BRKPB_PPzPP: 2592 return AArch64::BRKPBS_PPzPP; 2593 case AArch64::BRKN_PPzP: 2594 return 
AArch64::BRKNS_PPzP; 2595 case AArch64::RDFFR_PPz: 2596 return AArch64::RDFFRS_PPz; 2597 case AArch64::PTRUE_B: 2598 return AArch64::PTRUES_B; 2599 } 2600 } 2601 2602 // Is this a candidate for ld/st merging or pairing? For example, we don't 2603 // touch volatiles or load/stores that have a hint to avoid pair formation. 2604 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const { 2605 2606 bool IsPreLdSt = isPreLdSt(MI); 2607 2608 // If this is a volatile load/store, don't mess with it. 2609 if (MI.hasOrderedMemoryRef()) 2610 return false; 2611 2612 // Make sure this is a reg/fi+imm (as opposed to an address reloc). 2613 // For Pre-inc LD/ST, the operand is shifted by one. 2614 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() || 2615 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) && 2616 "Expected a reg or frame index operand."); 2617 2618 // For Pre-indexed addressing quadword instructions, the third operand is the 2619 // immediate value. 2620 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm(); 2621 2622 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt) 2623 return false; 2624 2625 // Can't merge/pair if the instruction modifies the base register. 2626 // e.g., ldr x0, [x0] 2627 // This case will never occur with an FI base. 2628 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or 2629 // STR<S,D,Q,W,X>pre, it can be merged. 2630 // For example: 2631 // ldr q0, [x11, #32]! 2632 // ldr q1, [x11, #16] 2633 // to 2634 // ldp q0, q1, [x11, #32]! 2635 if (MI.getOperand(1).isReg() && !IsPreLdSt) { 2636 Register BaseReg = MI.getOperand(1).getReg(); 2637 const TargetRegisterInfo *TRI = &getRegisterInfo(); 2638 if (MI.modifiesRegister(BaseReg, TRI)) 2639 return false; 2640 } 2641 2642 // Check if this load/store has a hint to avoid pair formation. 2643 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. 2644 if (isLdStPairSuppressed(MI)) 2645 return false; 2646 2647 // Do not pair any callee-save store/reload instructions in the 2648 // prologue/epilogue if the CFI information encoded the operations as separate 2649 // instructions, as that will cause the size of the actual prologue to mismatch 2650 // with the prologue size recorded in the Windows CFI. 2651 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo(); 2652 bool NeedsWinCFI = MAI->usesWindowsCFI() && 2653 MI.getMF()->getFunction().needsUnwindTableEntry(); 2654 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) || 2655 MI.getFlag(MachineInstr::FrameDestroy))) 2656 return false; 2657 2658 // On some CPUs quad load/store pairs are slower than two single load/stores. 2659 if (Subtarget.isPaired128Slow()) { 2660 switch (MI.getOpcode()) { 2661 default: 2662 break; 2663 case AArch64::LDURQi: 2664 case AArch64::STURQi: 2665 case AArch64::LDRQui: 2666 case AArch64::STRQui: 2667 return false; 2668 } 2669 } 2670 2671 return true; 2672 } 2673 2674 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth( 2675 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, 2676 int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, 2677 const TargetRegisterInfo *TRI) const { 2678 if (!LdSt.mayLoadOrStore()) 2679 return false; 2680 2681 const MachineOperand *BaseOp; 2682 TypeSize WidthN(0, false); 2683 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable, 2684 WidthN, TRI)) 2685 return false; 2686 // The maximum vscale is 16 under AArch64, return the maximal extent for the 2687 // vector. 2688 Width = WidthN.isScalable() 2689 ? 
WidthN.getKnownMinValue() * AArch64::SVEMaxBitsPerVector / 2690 AArch64::SVEBitsPerBlock 2691 : WidthN.getKnownMinValue(); 2692 BaseOps.push_back(BaseOp); 2693 return true; 2694 } 2695 2696 std::optional<ExtAddrMode> 2697 AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI, 2698 const TargetRegisterInfo *TRI) const { 2699 const MachineOperand *Base; // Filled with the base operand of MI. 2700 int64_t Offset; // Filled with the offset of MI. 2701 bool OffsetIsScalable; 2702 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI)) 2703 return std::nullopt; 2704 2705 if (!Base->isReg()) 2706 return std::nullopt; 2707 ExtAddrMode AM; 2708 AM.BaseReg = Base->getReg(); 2709 AM.Displacement = Offset; 2710 AM.ScaledReg = 0; 2711 AM.Scale = 0; 2712 return AM; 2713 } 2714 2715 bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI, 2716 Register Reg, 2717 const MachineInstr &AddrI, 2718 ExtAddrMode &AM) const { 2719 // Filter out instructions into which we cannot fold. 2720 unsigned NumBytes; 2721 int64_t OffsetScale = 1; 2722 switch (MemI.getOpcode()) { 2723 default: 2724 return false; 2725 2726 case AArch64::LDURQi: 2727 case AArch64::STURQi: 2728 NumBytes = 16; 2729 break; 2730 2731 case AArch64::LDURDi: 2732 case AArch64::STURDi: 2733 case AArch64::LDURXi: 2734 case AArch64::STURXi: 2735 NumBytes = 8; 2736 break; 2737 2738 case AArch64::LDURWi: 2739 case AArch64::LDURSWi: 2740 case AArch64::STURWi: 2741 NumBytes = 4; 2742 break; 2743 2744 case AArch64::LDURHi: 2745 case AArch64::STURHi: 2746 case AArch64::LDURHHi: 2747 case AArch64::STURHHi: 2748 case AArch64::LDURSHXi: 2749 case AArch64::LDURSHWi: 2750 NumBytes = 2; 2751 break; 2752 2753 case AArch64::LDRBroX: 2754 case AArch64::LDRBBroX: 2755 case AArch64::LDRSBXroX: 2756 case AArch64::LDRSBWroX: 2757 case AArch64::STRBroX: 2758 case AArch64::STRBBroX: 2759 case AArch64::LDURBi: 2760 case AArch64::LDURBBi: 2761 case AArch64::LDURSBXi: 2762 case AArch64::LDURSBWi: 2763 case AArch64::STURBi: 2764 case AArch64::STURBBi: 2765 case AArch64::LDRBui: 2766 case AArch64::LDRBBui: 2767 case AArch64::LDRSBXui: 2768 case AArch64::LDRSBWui: 2769 case AArch64::STRBui: 2770 case AArch64::STRBBui: 2771 NumBytes = 1; 2772 break; 2773 2774 case AArch64::LDRQroX: 2775 case AArch64::STRQroX: 2776 case AArch64::LDRQui: 2777 case AArch64::STRQui: 2778 NumBytes = 16; 2779 OffsetScale = 16; 2780 break; 2781 2782 case AArch64::LDRDroX: 2783 case AArch64::STRDroX: 2784 case AArch64::LDRXroX: 2785 case AArch64::STRXroX: 2786 case AArch64::LDRDui: 2787 case AArch64::STRDui: 2788 case AArch64::LDRXui: 2789 case AArch64::STRXui: 2790 NumBytes = 8; 2791 OffsetScale = 8; 2792 break; 2793 2794 case AArch64::LDRWroX: 2795 case AArch64::LDRSWroX: 2796 case AArch64::STRWroX: 2797 case AArch64::LDRWui: 2798 case AArch64::LDRSWui: 2799 case AArch64::STRWui: 2800 NumBytes = 4; 2801 OffsetScale = 4; 2802 break; 2803 2804 case AArch64::LDRHroX: 2805 case AArch64::STRHroX: 2806 case AArch64::LDRHHroX: 2807 case AArch64::STRHHroX: 2808 case AArch64::LDRSHXroX: 2809 case AArch64::LDRSHWroX: 2810 case AArch64::LDRHui: 2811 case AArch64::STRHui: 2812 case AArch64::LDRHHui: 2813 case AArch64::STRHHui: 2814 case AArch64::LDRSHXui: 2815 case AArch64::LDRSHWui: 2816 NumBytes = 2; 2817 OffsetScale = 2; 2818 break; 2819 } 2820 2821 // Check the fold operand is not the loaded/stored value. 
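  // For instance (hypothetical virtual registers):
  //   %1 = ADDXri %0, 8, 0
  //   STRXui %1, %2, 0
  // Here %1 is the value being stored, not part of the address, so the add
  // must not be folded into the store's addressing mode.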
2822 const MachineOperand &BaseRegOp = MemI.getOperand(0); 2823 if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg) 2824 return false; 2825 2826 // Handle memory instructions with a [Reg, Reg] addressing mode. 2827 if (MemI.getOperand(2).isReg()) { 2828 // Bail if the addressing mode already includes extension of the offset 2829 // register. 2830 if (MemI.getOperand(3).getImm()) 2831 return false; 2832 2833 // Check if we actually have a scaled offset. 2834 if (MemI.getOperand(4).getImm() == 0) 2835 OffsetScale = 1; 2836 2837 // If the address instructions is folded into the base register, then the 2838 // addressing mode must not have a scale. Then we can swap the base and the 2839 // scaled registers. 2840 if (MemI.getOperand(1).getReg() == Reg && OffsetScale != 1) 2841 return false; 2842 2843 switch (AddrI.getOpcode()) { 2844 default: 2845 return false; 2846 2847 case AArch64::SBFMXri: 2848 // sxtw Xa, Wm 2849 // ldr Xd, [Xn, Xa, lsl #N] 2850 // -> 2851 // ldr Xd, [Xn, Wm, sxtw #N] 2852 if (AddrI.getOperand(2).getImm() != 0 || 2853 AddrI.getOperand(3).getImm() != 31) 2854 return false; 2855 2856 AM.BaseReg = MemI.getOperand(1).getReg(); 2857 if (AM.BaseReg == Reg) 2858 AM.BaseReg = MemI.getOperand(2).getReg(); 2859 AM.ScaledReg = AddrI.getOperand(1).getReg(); 2860 AM.Scale = OffsetScale; 2861 AM.Displacement = 0; 2862 AM.Form = ExtAddrMode::Formula::SExtScaledReg; 2863 return true; 2864 2865 case TargetOpcode::SUBREG_TO_REG: { 2866 // mov Wa, Wm 2867 // ldr Xd, [Xn, Xa, lsl #N] 2868 // -> 2869 // ldr Xd, [Xn, Wm, uxtw #N] 2870 2871 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG. 2872 if (AddrI.getOperand(1).getImm() != 0 || 2873 AddrI.getOperand(3).getImm() != AArch64::sub_32) 2874 return false; 2875 2876 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo(); 2877 Register OffsetReg = AddrI.getOperand(2).getReg(); 2878 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(OffsetReg)) 2879 return false; 2880 2881 const MachineInstr &DefMI = *MRI.getVRegDef(OffsetReg); 2882 if (DefMI.getOpcode() != AArch64::ORRWrs || 2883 DefMI.getOperand(1).getReg() != AArch64::WZR || 2884 DefMI.getOperand(3).getImm() != 0) 2885 return false; 2886 2887 AM.BaseReg = MemI.getOperand(1).getReg(); 2888 if (AM.BaseReg == Reg) 2889 AM.BaseReg = MemI.getOperand(2).getReg(); 2890 AM.ScaledReg = DefMI.getOperand(2).getReg(); 2891 AM.Scale = OffsetScale; 2892 AM.Displacement = 0; 2893 AM.Form = ExtAddrMode::Formula::ZExtScaledReg; 2894 return true; 2895 } 2896 } 2897 } 2898 2899 // Handle memory instructions with a [Reg, #Imm] addressing mode. 2900 2901 // Check we are not breaking a potential conversion to an LDP. 
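  // A sketch of the concern (hypothetical values): an 8-byte load at offset
  // #0 can still be combined into an LDP (whose byte-offset range for 8-byte
  // accesses is [-512, 504]), but folding an `add x1, x0, #1024` into it
  // would yield offset #1024, which cannot. The lambda below rejects folds
  // that move an offset from inside that range to outside it.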
2902 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset, 2903 int64_t NewOffset) -> bool { 2904 int64_t MinOffset, MaxOffset; 2905 switch (NumBytes) { 2906 default: 2907 return true; 2908 case 4: 2909 MinOffset = -256; 2910 MaxOffset = 252; 2911 break; 2912 case 8: 2913 MinOffset = -512; 2914 MaxOffset = 504; 2915 break; 2916 case 16: 2917 MinOffset = -1024; 2918 MaxOffset = 1008; 2919 break; 2920 } 2921 return OldOffset < MinOffset || OldOffset > MaxOffset || 2922 (NewOffset >= MinOffset && NewOffset <= MaxOffset); 2923 }; 2924 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool { 2925 int64_t OldOffset = MemI.getOperand(2).getImm() * OffsetScale; 2926 int64_t NewOffset = OldOffset + Disp; 2927 if (!isLegalAddressingMode(NumBytes, NewOffset, /* Scale */ 0)) 2928 return false; 2929 // If the old offset would fit into an LDP, but the new offset wouldn't, 2930 // bail out. 2931 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset)) 2932 return false; 2933 AM.BaseReg = AddrI.getOperand(1).getReg(); 2934 AM.ScaledReg = 0; 2935 AM.Scale = 0; 2936 AM.Displacement = NewOffset; 2937 AM.Form = ExtAddrMode::Formula::Basic; 2938 return true; 2939 }; 2940 2941 auto canFoldAddRegIntoAddrMode = 2942 [&](int64_t Scale, 2943 ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool { 2944 if (MemI.getOperand(2).getImm() != 0) 2945 return false; 2946 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale)) 2947 return false; 2948 AM.BaseReg = AddrI.getOperand(1).getReg(); 2949 AM.ScaledReg = AddrI.getOperand(2).getReg(); 2950 AM.Scale = Scale; 2951 AM.Displacement = 0; 2952 AM.Form = Form; 2953 return true; 2954 }; 2955 2956 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) { 2957 unsigned Opcode = MemI.getOpcode(); 2958 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) && 2959 Subtarget.isSTRQroSlow(); 2960 }; 2961 2962 int64_t Disp = 0; 2963 const bool OptSize = MemI.getMF()->getFunction().hasOptSize(); 2964 switch (AddrI.getOpcode()) { 2965 default: 2966 return false; 2967 2968 case AArch64::ADDXri: 2969 // add Xa, Xn, #N 2970 // ldr Xd, [Xa, #M] 2971 // -> 2972 // ldr Xd, [Xn, #N'+M] 2973 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm(); 2974 return canFoldAddSubImmIntoAddrMode(Disp); 2975 2976 case AArch64::SUBXri: 2977 // sub Xa, Xn, #N 2978 // ldr Xd, [Xa, #M] 2979 // -> 2980 // ldr Xd, [Xn, #N'+M] 2981 Disp = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm(); 2982 return canFoldAddSubImmIntoAddrMode(-Disp); 2983 2984 case AArch64::ADDXrs: { 2985 // add Xa, Xn, Xm, lsl #N 2986 // ldr Xd, [Xa] 2987 // -> 2988 // ldr Xd, [Xn, Xm, lsl #N] 2989 2990 // Don't fold the add if the result would be slower, unless optimising for 2991 // size. 2992 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(3).getImm()); 2993 if (AArch64_AM::getShiftType(Shift) != AArch64_AM::ShiftExtendType::LSL) 2994 return false; 2995 Shift = AArch64_AM::getShiftValue(Shift); 2996 if (!OptSize) { 2997 if ((Shift != 2 && Shift != 3) || !Subtarget.hasAddrLSLFast()) 2998 return false; 2999 if (avoidSlowSTRQ(MemI)) 3000 return false; 3001 } 3002 return canFoldAddRegIntoAddrMode(1ULL << Shift); 3003 } 3004 3005 case AArch64::ADDXrr: 3006 // add Xa, Xn, Xm 3007 // ldr Xd, [Xa] 3008 // -> 3009 // ldr Xd, [Xn, Xm, lsl #0] 3010 3011 // Don't fold the add if the result would be slower, unless optimising for 3012 // size. 
3013 if (!OptSize && avoidSlowSTRQ(MemI)) 3014 return false; 3015 return canFoldAddRegIntoAddrMode(1); 3016 3017 case AArch64::ADDXrx: 3018 // add Xa, Xn, Wm, {s,u}xtw #N 3019 // ldr Xd, [Xa] 3020 // -> 3021 // ldr Xd, [Xn, Wm, {s,u}xtw #N] 3022 3023 // Don't fold the add if the result would be slower, unless optimising for 3024 // size. 3025 if (!OptSize && avoidSlowSTRQ(MemI)) 3026 return false; 3027 3028 // Can fold only sign-/zero-extend of a word. 3029 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(3).getImm()); 3030 AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm); 3031 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW) 3032 return false; 3033 3034 return canFoldAddRegIntoAddrMode( 3035 1ULL << AArch64_AM::getArithShiftValue(Imm), 3036 (Extend == AArch64_AM::SXTW) ? ExtAddrMode::Formula::SExtScaledReg 3037 : ExtAddrMode::Formula::ZExtScaledReg); 3038 } 3039 } 3040 3041 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, 3042 // return the opcode of an instruction performing the same operation, but using 3043 // the [Reg, Reg] addressing mode. 3044 static unsigned regOffsetOpcode(unsigned Opcode) { 3045 switch (Opcode) { 3046 default: 3047 llvm_unreachable("Address folding not implemented for instruction"); 3048 3049 case AArch64::LDURQi: 3050 case AArch64::LDRQui: 3051 return AArch64::LDRQroX; 3052 case AArch64::STURQi: 3053 case AArch64::STRQui: 3054 return AArch64::STRQroX; 3055 case AArch64::LDURDi: 3056 case AArch64::LDRDui: 3057 return AArch64::LDRDroX; 3058 case AArch64::STURDi: 3059 case AArch64::STRDui: 3060 return AArch64::STRDroX; 3061 case AArch64::LDURXi: 3062 case AArch64::LDRXui: 3063 return AArch64::LDRXroX; 3064 case AArch64::STURXi: 3065 case AArch64::STRXui: 3066 return AArch64::STRXroX; 3067 case AArch64::LDURWi: 3068 case AArch64::LDRWui: 3069 return AArch64::LDRWroX; 3070 case AArch64::LDURSWi: 3071 case AArch64::LDRSWui: 3072 return AArch64::LDRSWroX; 3073 case AArch64::STURWi: 3074 case AArch64::STRWui: 3075 return AArch64::STRWroX; 3076 case AArch64::LDURHi: 3077 case AArch64::LDRHui: 3078 return AArch64::LDRHroX; 3079 case AArch64::STURHi: 3080 case AArch64::STRHui: 3081 return AArch64::STRHroX; 3082 case AArch64::LDURHHi: 3083 case AArch64::LDRHHui: 3084 return AArch64::LDRHHroX; 3085 case AArch64::STURHHi: 3086 case AArch64::STRHHui: 3087 return AArch64::STRHHroX; 3088 case AArch64::LDURSHXi: 3089 case AArch64::LDRSHXui: 3090 return AArch64::LDRSHXroX; 3091 case AArch64::LDURSHWi: 3092 case AArch64::LDRSHWui: 3093 return AArch64::LDRSHWroX; 3094 case AArch64::LDURBi: 3095 case AArch64::LDRBui: 3096 return AArch64::LDRBroX; 3097 case AArch64::LDURBBi: 3098 case AArch64::LDRBBui: 3099 return AArch64::LDRBBroX; 3100 case AArch64::LDURSBXi: 3101 case AArch64::LDRSBXui: 3102 return AArch64::LDRSBXroX; 3103 case AArch64::LDURSBWi: 3104 case AArch64::LDRSBWui: 3105 return AArch64::LDRSBWroX; 3106 case AArch64::STURBi: 3107 case AArch64::STRBui: 3108 return AArch64::STRBroX; 3109 case AArch64::STURBBi: 3110 case AArch64::STRBBui: 3111 return AArch64::STRBBroX; 3112 } 3113 } 3114 3115 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return 3116 // the opcode of an instruction performing the same operation, but using the 3117 // [Reg, #Imm] addressing mode with scaled offset. 
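// For example, scaledOffsetOpcode(AArch64::LDURXi, Scale) returns
// AArch64::LDRXui and sets Scale to 8, so a byte displacement of #1024
// becomes an immediate of #128 in the rewritten instruction (see
// emitLdStWithAddr below).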
3118 unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) { 3119 switch (Opcode) { 3120 default: 3121 llvm_unreachable("Address folding not implemented for instruction"); 3122 3123 case AArch64::LDURQi: 3124 Scale = 16; 3125 return AArch64::LDRQui; 3126 case AArch64::STURQi: 3127 Scale = 16; 3128 return AArch64::STRQui; 3129 case AArch64::LDURDi: 3130 Scale = 8; 3131 return AArch64::LDRDui; 3132 case AArch64::STURDi: 3133 Scale = 8; 3134 return AArch64::STRDui; 3135 case AArch64::LDURXi: 3136 Scale = 8; 3137 return AArch64::LDRXui; 3138 case AArch64::STURXi: 3139 Scale = 8; 3140 return AArch64::STRXui; 3141 case AArch64::LDURWi: 3142 Scale = 4; 3143 return AArch64::LDRWui; 3144 case AArch64::LDURSWi: 3145 Scale = 4; 3146 return AArch64::LDRSWui; 3147 case AArch64::STURWi: 3148 Scale = 4; 3149 return AArch64::STRWui; 3150 case AArch64::LDURHi: 3151 Scale = 2; 3152 return AArch64::LDRHui; 3153 case AArch64::STURHi: 3154 Scale = 2; 3155 return AArch64::STRHui; 3156 case AArch64::LDURHHi: 3157 Scale = 2; 3158 return AArch64::LDRHHui; 3159 case AArch64::STURHHi: 3160 Scale = 2; 3161 return AArch64::STRHHui; 3162 case AArch64::LDURSHXi: 3163 Scale = 2; 3164 return AArch64::LDRSHXui; 3165 case AArch64::LDURSHWi: 3166 Scale = 2; 3167 return AArch64::LDRSHWui; 3168 case AArch64::LDURBi: 3169 Scale = 1; 3170 return AArch64::LDRBui; 3171 case AArch64::LDURBBi: 3172 Scale = 1; 3173 return AArch64::LDRBBui; 3174 case AArch64::LDURSBXi: 3175 Scale = 1; 3176 return AArch64::LDRSBXui; 3177 case AArch64::LDURSBWi: 3178 Scale = 1; 3179 return AArch64::LDRSBWui; 3180 case AArch64::STURBi: 3181 Scale = 1; 3182 return AArch64::STRBui; 3183 case AArch64::STURBBi: 3184 Scale = 1; 3185 return AArch64::STRBBui; 3186 case AArch64::LDRQui: 3187 case AArch64::STRQui: 3188 Scale = 16; 3189 return Opcode; 3190 case AArch64::LDRDui: 3191 case AArch64::STRDui: 3192 case AArch64::LDRXui: 3193 case AArch64::STRXui: 3194 Scale = 8; 3195 return Opcode; 3196 case AArch64::LDRWui: 3197 case AArch64::LDRSWui: 3198 case AArch64::STRWui: 3199 Scale = 4; 3200 return Opcode; 3201 case AArch64::LDRHui: 3202 case AArch64::STRHui: 3203 case AArch64::LDRHHui: 3204 case AArch64::STRHHui: 3205 case AArch64::LDRSHXui: 3206 case AArch64::LDRSHWui: 3207 Scale = 2; 3208 return Opcode; 3209 case AArch64::LDRBui: 3210 case AArch64::LDRBBui: 3211 case AArch64::LDRSBXui: 3212 case AArch64::LDRSBWui: 3213 case AArch64::STRBui: 3214 case AArch64::STRBBui: 3215 Scale = 1; 3216 return Opcode; 3217 } 3218 } 3219 3220 // Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return 3221 // the opcode of an instruction performing the same operation, but using the 3222 // [Reg, #Imm] addressing mode with unscaled offset. 
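// For example, unscaledOffsetOpcode(AArch64::LDRXui) returns AArch64::LDURXi,
// which takes a signed 9-bit byte offset; emitLdStWithAddr prefers this form
// whenever the displacement satisfies isInt<9>.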
3223 unsigned unscaledOffsetOpcode(unsigned Opcode) { 3224 switch (Opcode) { 3225 default: 3226 llvm_unreachable("Address folding not implemented for instruction"); 3227 3228 case AArch64::LDURQi: 3229 case AArch64::STURQi: 3230 case AArch64::LDURDi: 3231 case AArch64::STURDi: 3232 case AArch64::LDURXi: 3233 case AArch64::STURXi: 3234 case AArch64::LDURWi: 3235 case AArch64::LDURSWi: 3236 case AArch64::STURWi: 3237 case AArch64::LDURHi: 3238 case AArch64::STURHi: 3239 case AArch64::LDURHHi: 3240 case AArch64::STURHHi: 3241 case AArch64::LDURSHXi: 3242 case AArch64::LDURSHWi: 3243 case AArch64::LDURBi: 3244 case AArch64::STURBi: 3245 case AArch64::LDURBBi: 3246 case AArch64::STURBBi: 3247 case AArch64::LDURSBWi: 3248 case AArch64::LDURSBXi: 3249 return Opcode; 3250 case AArch64::LDRQui: 3251 return AArch64::LDURQi; 3252 case AArch64::STRQui: 3253 return AArch64::STURQi; 3254 case AArch64::LDRDui: 3255 return AArch64::LDURDi; 3256 case AArch64::STRDui: 3257 return AArch64::STURDi; 3258 case AArch64::LDRXui: 3259 return AArch64::LDURXi; 3260 case AArch64::STRXui: 3261 return AArch64::STURXi; 3262 case AArch64::LDRWui: 3263 return AArch64::LDURWi; 3264 case AArch64::LDRSWui: 3265 return AArch64::LDURSWi; 3266 case AArch64::STRWui: 3267 return AArch64::STURWi; 3268 case AArch64::LDRHui: 3269 return AArch64::LDURHi; 3270 case AArch64::STRHui: 3271 return AArch64::STURHi; 3272 case AArch64::LDRHHui: 3273 return AArch64::LDURHHi; 3274 case AArch64::STRHHui: 3275 return AArch64::STURHHi; 3276 case AArch64::LDRSHXui: 3277 return AArch64::LDURSHXi; 3278 case AArch64::LDRSHWui: 3279 return AArch64::LDURSHWi; 3280 case AArch64::LDRBBui: 3281 return AArch64::LDURBBi; 3282 case AArch64::LDRBui: 3283 return AArch64::LDURBi; 3284 case AArch64::STRBBui: 3285 return AArch64::STURBBi; 3286 case AArch64::STRBui: 3287 return AArch64::STURBi; 3288 case AArch64::LDRSBWui: 3289 return AArch64::LDURSBWi; 3290 case AArch64::LDRSBXui: 3291 return AArch64::LDURSBXi; 3292 } 3293 } 3294 3295 // Given the opcode of a memory load/store instruction, return the opcode of an 3296 // instruction performing the same operation, but using 3297 // the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the 3298 // offset register. 
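// For example, offsetExtendOpcode(AArch64::LDRXroX) returns AArch64::LDRXroW;
// the sxtw-vs-uxtw choice and the shift amount are supplied by the caller as
// the two trailing immediate operands (see emitLdStWithAddr below).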
3299 static unsigned offsetExtendOpcode(unsigned Opcode) { 3300 switch (Opcode) { 3301 default: 3302 llvm_unreachable("Address folding not implemented for instruction"); 3303 3304 case AArch64::LDRQroX: 3305 case AArch64::LDURQi: 3306 case AArch64::LDRQui: 3307 return AArch64::LDRQroW; 3308 case AArch64::STRQroX: 3309 case AArch64::STURQi: 3310 case AArch64::STRQui: 3311 return AArch64::STRQroW; 3312 case AArch64::LDRDroX: 3313 case AArch64::LDURDi: 3314 case AArch64::LDRDui: 3315 return AArch64::LDRDroW; 3316 case AArch64::STRDroX: 3317 case AArch64::STURDi: 3318 case AArch64::STRDui: 3319 return AArch64::STRDroW; 3320 case AArch64::LDRXroX: 3321 case AArch64::LDURXi: 3322 case AArch64::LDRXui: 3323 return AArch64::LDRXroW; 3324 case AArch64::STRXroX: 3325 case AArch64::STURXi: 3326 case AArch64::STRXui: 3327 return AArch64::STRXroW; 3328 case AArch64::LDRWroX: 3329 case AArch64::LDURWi: 3330 case AArch64::LDRWui: 3331 return AArch64::LDRWroW; 3332 case AArch64::LDRSWroX: 3333 case AArch64::LDURSWi: 3334 case AArch64::LDRSWui: 3335 return AArch64::LDRSWroW; 3336 case AArch64::STRWroX: 3337 case AArch64::STURWi: 3338 case AArch64::STRWui: 3339 return AArch64::STRWroW; 3340 case AArch64::LDRHroX: 3341 case AArch64::LDURHi: 3342 case AArch64::LDRHui: 3343 return AArch64::LDRHroW; 3344 case AArch64::STRHroX: 3345 case AArch64::STURHi: 3346 case AArch64::STRHui: 3347 return AArch64::STRHroW; 3348 case AArch64::LDRHHroX: 3349 case AArch64::LDURHHi: 3350 case AArch64::LDRHHui: 3351 return AArch64::LDRHHroW; 3352 case AArch64::STRHHroX: 3353 case AArch64::STURHHi: 3354 case AArch64::STRHHui: 3355 return AArch64::STRHHroW; 3356 case AArch64::LDRSHXroX: 3357 case AArch64::LDURSHXi: 3358 case AArch64::LDRSHXui: 3359 return AArch64::LDRSHXroW; 3360 case AArch64::LDRSHWroX: 3361 case AArch64::LDURSHWi: 3362 case AArch64::LDRSHWui: 3363 return AArch64::LDRSHWroW; 3364 case AArch64::LDRBroX: 3365 case AArch64::LDURBi: 3366 case AArch64::LDRBui: 3367 return AArch64::LDRBroW; 3368 case AArch64::LDRBBroX: 3369 case AArch64::LDURBBi: 3370 case AArch64::LDRBBui: 3371 return AArch64::LDRBBroW; 3372 case AArch64::LDRSBXroX: 3373 case AArch64::LDURSBXi: 3374 case AArch64::LDRSBXui: 3375 return AArch64::LDRSBXroW; 3376 case AArch64::LDRSBWroX: 3377 case AArch64::LDURSBWi: 3378 case AArch64::LDRSBWui: 3379 return AArch64::LDRSBWroW; 3380 case AArch64::STRBroX: 3381 case AArch64::STURBi: 3382 case AArch64::STRBui: 3383 return AArch64::STRBroW; 3384 case AArch64::STRBBroX: 3385 case AArch64::STURBBi: 3386 case AArch64::STRBBui: 3387 return AArch64::STRBBroW; 3388 } 3389 } 3390 3391 MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI, 3392 const ExtAddrMode &AM) const { 3393 3394 const DebugLoc &DL = MemI.getDebugLoc(); 3395 MachineBasicBlock &MBB = *MemI.getParent(); 3396 MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo(); 3397 3398 if (AM.Form == ExtAddrMode::Formula::Basic) { 3399 if (AM.ScaledReg) { 3400 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`. 3401 unsigned Opcode = regOffsetOpcode(MemI.getOpcode()); 3402 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass); 3403 auto B = BuildMI(MBB, MemI, DL, get(Opcode)) 3404 .addReg(MemI.getOperand(0).getReg(), 3405 MemI.mayLoad() ? 
RegState::Define : 0) 3406 .addReg(AM.BaseReg) 3407 .addReg(AM.ScaledReg) 3408 .addImm(0) 3409 .addImm(AM.Scale > 1) 3410 .setMemRefs(MemI.memoperands()) 3411 .setMIFlags(MemI.getFlags()); 3412 return B.getInstr(); 3413 } 3414 3415 assert(AM.ScaledReg == 0 && AM.Scale == 0 && 3416 "Addressing mode not supported for folding"); 3417 3418 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`. 3419 unsigned Scale = 1; 3420 unsigned Opcode = MemI.getOpcode(); 3421 if (isInt<9>(AM.Displacement)) 3422 Opcode = unscaledOffsetOpcode(Opcode); 3423 else 3424 Opcode = scaledOffsetOpcode(Opcode, Scale); 3425 3426 auto B = BuildMI(MBB, MemI, DL, get(Opcode)) 3427 .addReg(MemI.getOperand(0).getReg(), 3428 MemI.mayLoad() ? RegState::Define : 0) 3429 .addReg(AM.BaseReg) 3430 .addImm(AM.Displacement / Scale) 3431 .setMemRefs(MemI.memoperands()) 3432 .setMIFlags(MemI.getFlags()); 3433 return B.getInstr(); 3434 } 3435 3436 if (AM.Form == ExtAddrMode::Formula::SExtScaledReg || 3437 AM.Form == ExtAddrMode::Formula::ZExtScaledReg) { 3438 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`. 3439 assert(AM.ScaledReg && !AM.Displacement && 3440 "Address offset can be a register or an immediate, but not both"); 3441 unsigned Opcode = offsetExtendOpcode(MemI.getOpcode()); 3442 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass); 3443 // Make sure the offset register is in the correct register class. 3444 Register OffsetReg = AM.ScaledReg; 3445 const TargetRegisterClass *RC = MRI.getRegClass(OffsetReg); 3446 if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) { 3447 OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); 3448 BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg) 3449 .addReg(AM.ScaledReg, 0, AArch64::sub_32); 3450 } 3451 auto B = BuildMI(MBB, MemI, DL, get(Opcode)) 3452 .addReg(MemI.getOperand(0).getReg(), 3453 MemI.mayLoad() ? RegState::Define : 0) 3454 .addReg(AM.BaseReg) 3455 .addReg(OffsetReg) 3456 .addImm(AM.Form == ExtAddrMode::Formula::SExtScaledReg) 3457 .addImm(AM.Scale != 1) 3458 .setMemRefs(MemI.memoperands()) 3459 .setMIFlags(MemI.getFlags()); 3460 3461 return B.getInstr(); 3462 } 3463 3464 llvm_unreachable( 3465 "Function must not be called with an addressing mode it can't handle"); 3466 } 3467 3468 bool AArch64InstrInfo::getMemOperandWithOffsetWidth( 3469 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset, 3470 bool &OffsetIsScalable, TypeSize &Width, 3471 const TargetRegisterInfo *TRI) const { 3472 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 3473 // Handle only loads/stores with base register followed by immediate offset. 3474 if (LdSt.getNumExplicitOperands() == 3) { 3475 // Non-paired instruction (e.g., ldr x1, [x0, #8]). 3476 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) || 3477 !LdSt.getOperand(2).isImm()) 3478 return false; 3479 } else if (LdSt.getNumExplicitOperands() == 4) { 3480 // Paired instruction (e.g., ldp x1, x2, [x0, #8]). 3481 if (!LdSt.getOperand(1).isReg() || 3482 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) || 3483 !LdSt.getOperand(3).isImm()) 3484 return false; 3485 } else 3486 return false; 3487 3488 // Get the scaling factor for the instruction and set the width for the 3489 // instruction. 3490 TypeSize Scale(0U, false); 3491 int64_t Dummy1, Dummy2; 3492 3493 // If this returns false, then it's an instruction we don't want to handle. 
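  // For example (values taken from getMemOpInfo below): LDRXui has Scale == 8
  // and Width == 8, while LDURXi has Scale == 1 and Width == 8, so an
  // immediate operand of 3 means a byte offset of 24 for the former and 3 for
  // the latter.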
3494 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2)) 3495 return false; 3496 3497 // Compute the offset. Offset is calculated as the immediate operand 3498 // multiplied by the scaling factor. Unscaled instructions have scaling factor 3499 // set to 1. 3500 if (LdSt.getNumExplicitOperands() == 3) { 3501 BaseOp = &LdSt.getOperand(1); 3502 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue(); 3503 } else { 3504 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); 3505 BaseOp = &LdSt.getOperand(2); 3506 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue(); 3507 } 3508 OffsetIsScalable = Scale.isScalable(); 3509 3510 if (!BaseOp->isReg() && !BaseOp->isFI()) 3511 return false; 3512 3513 return true; 3514 } 3515 3516 MachineOperand & 3517 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const { 3518 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 3519 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1); 3520 assert(OfsOp.isImm() && "Offset operand wasn't immediate."); 3521 return OfsOp; 3522 } 3523 3524 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, 3525 TypeSize &Width, int64_t &MinOffset, 3526 int64_t &MaxOffset) { 3527 switch (Opcode) { 3528 // Not a memory operation or something we want to handle. 3529 default: 3530 Scale = TypeSize::getFixed(0); 3531 Width = TypeSize::getFixed(0); 3532 MinOffset = MaxOffset = 0; 3533 return false; 3534 case AArch64::STRWpost: 3535 case AArch64::LDRWpost: 3536 Width = TypeSize::getFixed(32); 3537 Scale = TypeSize::getFixed(4); 3538 MinOffset = -256; 3539 MaxOffset = 255; 3540 break; 3541 case AArch64::LDURQi: 3542 case AArch64::STURQi: 3543 Width = TypeSize::getFixed(16); 3544 Scale = TypeSize::getFixed(1); 3545 MinOffset = -256; 3546 MaxOffset = 255; 3547 break; 3548 case AArch64::PRFUMi: 3549 case AArch64::LDURXi: 3550 case AArch64::LDURDi: 3551 case AArch64::LDAPURXi: 3552 case AArch64::STURXi: 3553 case AArch64::STURDi: 3554 case AArch64::STLURXi: 3555 Width = TypeSize::getFixed(8); 3556 Scale = TypeSize::getFixed(1); 3557 MinOffset = -256; 3558 MaxOffset = 255; 3559 break; 3560 case AArch64::LDURWi: 3561 case AArch64::LDURSi: 3562 case AArch64::LDURSWi: 3563 case AArch64::LDAPURi: 3564 case AArch64::LDAPURSWi: 3565 case AArch64::STURWi: 3566 case AArch64::STURSi: 3567 case AArch64::STLURWi: 3568 Width = TypeSize::getFixed(4); 3569 Scale = TypeSize::getFixed(1); 3570 MinOffset = -256; 3571 MaxOffset = 255; 3572 break; 3573 case AArch64::LDURHi: 3574 case AArch64::LDURHHi: 3575 case AArch64::LDURSHXi: 3576 case AArch64::LDURSHWi: 3577 case AArch64::LDAPURHi: 3578 case AArch64::LDAPURSHWi: 3579 case AArch64::LDAPURSHXi: 3580 case AArch64::STURHi: 3581 case AArch64::STURHHi: 3582 case AArch64::STLURHi: 3583 Width = TypeSize::getFixed(2); 3584 Scale = TypeSize::getFixed(1); 3585 MinOffset = -256; 3586 MaxOffset = 255; 3587 break; 3588 case AArch64::LDURBi: 3589 case AArch64::LDURBBi: 3590 case AArch64::LDURSBXi: 3591 case AArch64::LDURSBWi: 3592 case AArch64::LDAPURBi: 3593 case AArch64::LDAPURSBWi: 3594 case AArch64::LDAPURSBXi: 3595 case AArch64::STURBi: 3596 case AArch64::STURBBi: 3597 case AArch64::STLURBi: 3598 Width = TypeSize::getFixed(1); 3599 Scale = TypeSize::getFixed(1); 3600 MinOffset = -256; 3601 MaxOffset = 255; 3602 break; 3603 case AArch64::LDPQi: 3604 case AArch64::LDNPQi: 3605 case AArch64::STPQi: 3606 case AArch64::STNPQi: 3607 Scale = TypeSize::getFixed(16); 3608 Width = 
TypeSize::getFixed(32); 3609 MinOffset = -64; 3610 MaxOffset = 63; 3611 break; 3612 case AArch64::LDRQui: 3613 case AArch64::STRQui: 3614 Scale = TypeSize::getFixed(16); 3615 Width = TypeSize::getFixed(16); 3616 MinOffset = 0; 3617 MaxOffset = 4095; 3618 break; 3619 case AArch64::LDPXi: 3620 case AArch64::LDPDi: 3621 case AArch64::LDNPXi: 3622 case AArch64::LDNPDi: 3623 case AArch64::STPXi: 3624 case AArch64::STPDi: 3625 case AArch64::STNPXi: 3626 case AArch64::STNPDi: 3627 Scale = TypeSize::getFixed(8); 3628 Width = TypeSize::getFixed(16); 3629 MinOffset = -64; 3630 MaxOffset = 63; 3631 break; 3632 case AArch64::PRFMui: 3633 case AArch64::LDRXui: 3634 case AArch64::LDRDui: 3635 case AArch64::STRXui: 3636 case AArch64::STRDui: 3637 Scale = TypeSize::getFixed(8); 3638 Width = TypeSize::getFixed(8); 3639 MinOffset = 0; 3640 MaxOffset = 4095; 3641 break; 3642 case AArch64::StoreSwiftAsyncContext: 3643 // Store is an STRXui, but there might be an ADDXri in the expansion too. 3644 Scale = TypeSize::getFixed(1); 3645 Width = TypeSize::getFixed(8); 3646 MinOffset = 0; 3647 MaxOffset = 4095; 3648 break; 3649 case AArch64::LDPWi: 3650 case AArch64::LDPSi: 3651 case AArch64::LDNPWi: 3652 case AArch64::LDNPSi: 3653 case AArch64::STPWi: 3654 case AArch64::STPSi: 3655 case AArch64::STNPWi: 3656 case AArch64::STNPSi: 3657 Scale = TypeSize::getFixed(4); 3658 Width = TypeSize::getFixed(8); 3659 MinOffset = -64; 3660 MaxOffset = 63; 3661 break; 3662 case AArch64::LDRWui: 3663 case AArch64::LDRSui: 3664 case AArch64::LDRSWui: 3665 case AArch64::STRWui: 3666 case AArch64::STRSui: 3667 Scale = TypeSize::getFixed(4); 3668 Width = TypeSize::getFixed(4); 3669 MinOffset = 0; 3670 MaxOffset = 4095; 3671 break; 3672 case AArch64::LDRHui: 3673 case AArch64::LDRHHui: 3674 case AArch64::LDRSHWui: 3675 case AArch64::LDRSHXui: 3676 case AArch64::STRHui: 3677 case AArch64::STRHHui: 3678 Scale = TypeSize::getFixed(2); 3679 Width = TypeSize::getFixed(2); 3680 MinOffset = 0; 3681 MaxOffset = 4095; 3682 break; 3683 case AArch64::LDRBui: 3684 case AArch64::LDRBBui: 3685 case AArch64::LDRSBWui: 3686 case AArch64::LDRSBXui: 3687 case AArch64::STRBui: 3688 case AArch64::STRBBui: 3689 Scale = TypeSize::getFixed(1); 3690 Width = TypeSize::getFixed(1); 3691 MinOffset = 0; 3692 MaxOffset = 4095; 3693 break; 3694 case AArch64::STPXpre: 3695 case AArch64::LDPXpost: 3696 case AArch64::STPDpre: 3697 case AArch64::LDPDpost: 3698 Scale = TypeSize::getFixed(8); 3699 Width = TypeSize::getFixed(8); 3700 MinOffset = -512; 3701 MaxOffset = 504; 3702 break; 3703 case AArch64::STPQpre: 3704 case AArch64::LDPQpost: 3705 Scale = TypeSize::getFixed(16); 3706 Width = TypeSize::getFixed(16); 3707 MinOffset = -1024; 3708 MaxOffset = 1008; 3709 break; 3710 case AArch64::STRXpre: 3711 case AArch64::STRDpre: 3712 case AArch64::LDRXpost: 3713 case AArch64::LDRDpost: 3714 Scale = TypeSize::getFixed(1); 3715 Width = TypeSize::getFixed(8); 3716 MinOffset = -256; 3717 MaxOffset = 255; 3718 break; 3719 case AArch64::STRQpre: 3720 case AArch64::LDRQpost: 3721 Scale = TypeSize::getFixed(1); 3722 Width = TypeSize::getFixed(16); 3723 MinOffset = -256; 3724 MaxOffset = 255; 3725 break; 3726 case AArch64::ADDG: 3727 Scale = TypeSize::getFixed(16); 3728 Width = TypeSize::getFixed(0); 3729 MinOffset = 0; 3730 MaxOffset = 63; 3731 break; 3732 case AArch64::TAGPstack: 3733 Scale = TypeSize::getFixed(16); 3734 Width = TypeSize::getFixed(0); 3735 // TAGP with a negative offset turns into SUBP, which has a maximum offset 3736 // of 63 (not 64!). 
3737 MinOffset = -63; 3738 MaxOffset = 63; 3739 break; 3740 case AArch64::LDG: 3741 case AArch64::STGi: 3742 case AArch64::STZGi: 3743 Scale = TypeSize::getFixed(16); 3744 Width = TypeSize::getFixed(16); 3745 MinOffset = -256; 3746 MaxOffset = 255; 3747 break; 3748 case AArch64::STR_ZZZZXI: 3749 case AArch64::LDR_ZZZZXI: 3750 Scale = TypeSize::getScalable(16); 3751 Width = TypeSize::getScalable(16 * 4); 3752 MinOffset = -256; 3753 MaxOffset = 252; 3754 break; 3755 case AArch64::STR_ZZZXI: 3756 case AArch64::LDR_ZZZXI: 3757 Scale = TypeSize::getScalable(16); 3758 Width = TypeSize::getScalable(16 * 3); 3759 MinOffset = -256; 3760 MaxOffset = 253; 3761 break; 3762 case AArch64::STR_ZZXI: 3763 case AArch64::LDR_ZZXI: 3764 Scale = TypeSize::getScalable(16); 3765 Width = TypeSize::getScalable(16 * 2); 3766 MinOffset = -256; 3767 MaxOffset = 254; 3768 break; 3769 case AArch64::LDR_PXI: 3770 case AArch64::STR_PXI: 3771 Scale = TypeSize::getScalable(2); 3772 Width = TypeSize::getScalable(2); 3773 MinOffset = -256; 3774 MaxOffset = 255; 3775 break; 3776 case AArch64::LDR_PPXI: 3777 case AArch64::STR_PPXI: 3778 Scale = TypeSize::getScalable(2); 3779 Width = TypeSize::getScalable(2 * 2); 3780 MinOffset = -256; 3781 MaxOffset = 254; 3782 break; 3783 case AArch64::LDR_ZXI: 3784 case AArch64::STR_ZXI: 3785 Scale = TypeSize::getScalable(16); 3786 Width = TypeSize::getScalable(16); 3787 MinOffset = -256; 3788 MaxOffset = 255; 3789 break; 3790 case AArch64::LD1B_IMM: 3791 case AArch64::LD1H_IMM: 3792 case AArch64::LD1W_IMM: 3793 case AArch64::LD1D_IMM: 3794 case AArch64::LDNT1B_ZRI: 3795 case AArch64::LDNT1H_ZRI: 3796 case AArch64::LDNT1W_ZRI: 3797 case AArch64::LDNT1D_ZRI: 3798 case AArch64::ST1B_IMM: 3799 case AArch64::ST1H_IMM: 3800 case AArch64::ST1W_IMM: 3801 case AArch64::ST1D_IMM: 3802 case AArch64::STNT1B_ZRI: 3803 case AArch64::STNT1H_ZRI: 3804 case AArch64::STNT1W_ZRI: 3805 case AArch64::STNT1D_ZRI: 3806 case AArch64::LDNF1B_IMM: 3807 case AArch64::LDNF1H_IMM: 3808 case AArch64::LDNF1W_IMM: 3809 case AArch64::LDNF1D_IMM: 3810 // A full vectors worth of data 3811 // Width = mbytes * elements 3812 Scale = TypeSize::getScalable(16); 3813 Width = TypeSize::getScalable(16); 3814 MinOffset = -8; 3815 MaxOffset = 7; 3816 break; 3817 case AArch64::LD2B_IMM: 3818 case AArch64::LD2H_IMM: 3819 case AArch64::LD2W_IMM: 3820 case AArch64::LD2D_IMM: 3821 case AArch64::ST2B_IMM: 3822 case AArch64::ST2H_IMM: 3823 case AArch64::ST2W_IMM: 3824 case AArch64::ST2D_IMM: 3825 Scale = TypeSize::getScalable(32); 3826 Width = TypeSize::getScalable(16 * 2); 3827 MinOffset = -8; 3828 MaxOffset = 7; 3829 break; 3830 case AArch64::LD3B_IMM: 3831 case AArch64::LD3H_IMM: 3832 case AArch64::LD3W_IMM: 3833 case AArch64::LD3D_IMM: 3834 case AArch64::ST3B_IMM: 3835 case AArch64::ST3H_IMM: 3836 case AArch64::ST3W_IMM: 3837 case AArch64::ST3D_IMM: 3838 Scale = TypeSize::getScalable(48); 3839 Width = TypeSize::getScalable(16 * 3); 3840 MinOffset = -8; 3841 MaxOffset = 7; 3842 break; 3843 case AArch64::LD4B_IMM: 3844 case AArch64::LD4H_IMM: 3845 case AArch64::LD4W_IMM: 3846 case AArch64::LD4D_IMM: 3847 case AArch64::ST4B_IMM: 3848 case AArch64::ST4H_IMM: 3849 case AArch64::ST4W_IMM: 3850 case AArch64::ST4D_IMM: 3851 Scale = TypeSize::getScalable(64); 3852 Width = TypeSize::getScalable(16 * 4); 3853 MinOffset = -8; 3854 MaxOffset = 7; 3855 break; 3856 case AArch64::LD1B_H_IMM: 3857 case AArch64::LD1SB_H_IMM: 3858 case AArch64::LD1H_S_IMM: 3859 case AArch64::LD1SH_S_IMM: 3860 case AArch64::LD1W_D_IMM: 3861 case AArch64::LD1SW_D_IMM: 3862 
case AArch64::ST1B_H_IMM: 3863 case AArch64::ST1H_S_IMM: 3864 case AArch64::ST1W_D_IMM: 3865 case AArch64::LDNF1B_H_IMM: 3866 case AArch64::LDNF1SB_H_IMM: 3867 case AArch64::LDNF1H_S_IMM: 3868 case AArch64::LDNF1SH_S_IMM: 3869 case AArch64::LDNF1W_D_IMM: 3870 case AArch64::LDNF1SW_D_IMM: 3871 // A half vector worth of data 3872 // Width = mbytes * elements 3873 Scale = TypeSize::getScalable(8); 3874 Width = TypeSize::getScalable(8); 3875 MinOffset = -8; 3876 MaxOffset = 7; 3877 break; 3878 case AArch64::LD1B_S_IMM: 3879 case AArch64::LD1SB_S_IMM: 3880 case AArch64::LD1H_D_IMM: 3881 case AArch64::LD1SH_D_IMM: 3882 case AArch64::ST1B_S_IMM: 3883 case AArch64::ST1H_D_IMM: 3884 case AArch64::LDNF1B_S_IMM: 3885 case AArch64::LDNF1SB_S_IMM: 3886 case AArch64::LDNF1H_D_IMM: 3887 case AArch64::LDNF1SH_D_IMM: 3888 // A quarter vector worth of data 3889 // Width = mbytes * elements 3890 Scale = TypeSize::getScalable(4); 3891 Width = TypeSize::getScalable(4); 3892 MinOffset = -8; 3893 MaxOffset = 7; 3894 break; 3895 case AArch64::LD1B_D_IMM: 3896 case AArch64::LD1SB_D_IMM: 3897 case AArch64::ST1B_D_IMM: 3898 case AArch64::LDNF1B_D_IMM: 3899 case AArch64::LDNF1SB_D_IMM: 3900 // A eighth vector worth of data 3901 // Width = mbytes * elements 3902 Scale = TypeSize::getScalable(2); 3903 Width = TypeSize::getScalable(2); 3904 MinOffset = -8; 3905 MaxOffset = 7; 3906 break; 3907 case AArch64::ST2Gi: 3908 case AArch64::STZ2Gi: 3909 Scale = TypeSize::getFixed(16); 3910 Width = TypeSize::getFixed(32); 3911 MinOffset = -256; 3912 MaxOffset = 255; 3913 break; 3914 case AArch64::STGPi: 3915 Scale = TypeSize::getFixed(16); 3916 Width = TypeSize::getFixed(16); 3917 MinOffset = -64; 3918 MaxOffset = 63; 3919 break; 3920 case AArch64::LD1RB_IMM: 3921 case AArch64::LD1RB_H_IMM: 3922 case AArch64::LD1RB_S_IMM: 3923 case AArch64::LD1RB_D_IMM: 3924 case AArch64::LD1RSB_H_IMM: 3925 case AArch64::LD1RSB_S_IMM: 3926 case AArch64::LD1RSB_D_IMM: 3927 Scale = TypeSize::getFixed(1); 3928 Width = TypeSize::getFixed(1); 3929 MinOffset = 0; 3930 MaxOffset = 63; 3931 break; 3932 case AArch64::LD1RH_IMM: 3933 case AArch64::LD1RH_S_IMM: 3934 case AArch64::LD1RH_D_IMM: 3935 case AArch64::LD1RSH_S_IMM: 3936 case AArch64::LD1RSH_D_IMM: 3937 Scale = TypeSize::getFixed(2); 3938 Width = TypeSize::getFixed(2); 3939 MinOffset = 0; 3940 MaxOffset = 63; 3941 break; 3942 case AArch64::LD1RW_IMM: 3943 case AArch64::LD1RW_D_IMM: 3944 case AArch64::LD1RSW_IMM: 3945 Scale = TypeSize::getFixed(4); 3946 Width = TypeSize::getFixed(4); 3947 MinOffset = 0; 3948 MaxOffset = 63; 3949 break; 3950 case AArch64::LD1RD_IMM: 3951 Scale = TypeSize::getFixed(8); 3952 Width = TypeSize::getFixed(8); 3953 MinOffset = 0; 3954 MaxOffset = 63; 3955 break; 3956 } 3957 3958 return true; 3959 } 3960 3961 // Scaling factor for unscaled load or store. 
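// For example, getMemScale(AArch64::LDURWi) and getMemScale(AArch64::LDRWui)
// both return 4, while getMemScale(AArch64::LDRQui) returns 16; helpers such
// as scaleOffset() below divide an unscaled byte offset by this value.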
3962 int AArch64InstrInfo::getMemScale(unsigned Opc) { 3963 switch (Opc) { 3964 default: 3965 llvm_unreachable("Opcode has unknown scale!"); 3966 case AArch64::LDRBBui: 3967 case AArch64::LDURBBi: 3968 case AArch64::LDRSBWui: 3969 case AArch64::LDURSBWi: 3970 case AArch64::STRBBui: 3971 case AArch64::STURBBi: 3972 return 1; 3973 case AArch64::LDRHHui: 3974 case AArch64::LDURHHi: 3975 case AArch64::LDRSHWui: 3976 case AArch64::LDURSHWi: 3977 case AArch64::STRHHui: 3978 case AArch64::STURHHi: 3979 return 2; 3980 case AArch64::LDRSui: 3981 case AArch64::LDURSi: 3982 case AArch64::LDRSpre: 3983 case AArch64::LDRSWui: 3984 case AArch64::LDURSWi: 3985 case AArch64::LDRSWpre: 3986 case AArch64::LDRWpre: 3987 case AArch64::LDRWui: 3988 case AArch64::LDURWi: 3989 case AArch64::STRSui: 3990 case AArch64::STURSi: 3991 case AArch64::STRSpre: 3992 case AArch64::STRWui: 3993 case AArch64::STURWi: 3994 case AArch64::STRWpre: 3995 case AArch64::LDPSi: 3996 case AArch64::LDPSWi: 3997 case AArch64::LDPWi: 3998 case AArch64::STPSi: 3999 case AArch64::STPWi: 4000 return 4; 4001 case AArch64::LDRDui: 4002 case AArch64::LDURDi: 4003 case AArch64::LDRDpre: 4004 case AArch64::LDRXui: 4005 case AArch64::LDURXi: 4006 case AArch64::LDRXpre: 4007 case AArch64::STRDui: 4008 case AArch64::STURDi: 4009 case AArch64::STRDpre: 4010 case AArch64::STRXui: 4011 case AArch64::STURXi: 4012 case AArch64::STRXpre: 4013 case AArch64::LDPDi: 4014 case AArch64::LDPXi: 4015 case AArch64::STPDi: 4016 case AArch64::STPXi: 4017 return 8; 4018 case AArch64::LDRQui: 4019 case AArch64::LDURQi: 4020 case AArch64::STRQui: 4021 case AArch64::STURQi: 4022 case AArch64::STRQpre: 4023 case AArch64::LDPQi: 4024 case AArch64::LDRQpre: 4025 case AArch64::STPQi: 4026 case AArch64::STGi: 4027 case AArch64::STZGi: 4028 case AArch64::ST2Gi: 4029 case AArch64::STZ2Gi: 4030 case AArch64::STGPi: 4031 return 16; 4032 } 4033 } 4034 4035 bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) { 4036 switch (MI.getOpcode()) { 4037 default: 4038 return false; 4039 case AArch64::LDRWpre: 4040 case AArch64::LDRXpre: 4041 case AArch64::LDRSWpre: 4042 case AArch64::LDRSpre: 4043 case AArch64::LDRDpre: 4044 case AArch64::LDRQpre: 4045 return true; 4046 } 4047 } 4048 4049 bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) { 4050 switch (MI.getOpcode()) { 4051 default: 4052 return false; 4053 case AArch64::STRWpre: 4054 case AArch64::STRXpre: 4055 case AArch64::STRSpre: 4056 case AArch64::STRDpre: 4057 case AArch64::STRQpre: 4058 return true; 4059 } 4060 } 4061 4062 bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) { 4063 return isPreLd(MI) || isPreSt(MI); 4064 } 4065 4066 bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) { 4067 switch (MI.getOpcode()) { 4068 default: 4069 return false; 4070 case AArch64::LDPSi: 4071 case AArch64::LDPSWi: 4072 case AArch64::LDPDi: 4073 case AArch64::LDPQi: 4074 case AArch64::LDPWi: 4075 case AArch64::LDPXi: 4076 case AArch64::STPSi: 4077 case AArch64::STPDi: 4078 case AArch64::STPQi: 4079 case AArch64::STPWi: 4080 case AArch64::STPXi: 4081 case AArch64::STGPi: 4082 return true; 4083 } 4084 } 4085 4086 const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) { 4087 unsigned Idx = 4088 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 
2 4089 : 1; 4090 return MI.getOperand(Idx); 4091 } 4092 4093 const MachineOperand & 4094 AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) { 4095 unsigned Idx = 4096 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3 4097 : 2; 4098 return MI.getOperand(Idx); 4099 } 4100 4101 static const TargetRegisterClass *getRegClass(const MachineInstr &MI, 4102 Register Reg) { 4103 if (MI.getParent() == nullptr) 4104 return nullptr; 4105 const MachineFunction *MF = MI.getParent()->getParent(); 4106 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr; 4107 } 4108 4109 bool AArch64InstrInfo::isHForm(const MachineInstr &MI) { 4110 auto IsHFPR = [&](const MachineOperand &Op) { 4111 if (!Op.isReg()) 4112 return false; 4113 auto Reg = Op.getReg(); 4114 if (Reg.isPhysical()) 4115 return AArch64::FPR16RegClass.contains(Reg); 4116 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); 4117 return TRC == &AArch64::FPR16RegClass || 4118 TRC == &AArch64::FPR16_loRegClass; 4119 }; 4120 return llvm::any_of(MI.operands(), IsHFPR); 4121 } 4122 4123 bool AArch64InstrInfo::isQForm(const MachineInstr &MI) { 4124 auto IsQFPR = [&](const MachineOperand &Op) { 4125 if (!Op.isReg()) 4126 return false; 4127 auto Reg = Op.getReg(); 4128 if (Reg.isPhysical()) 4129 return AArch64::FPR128RegClass.contains(Reg); 4130 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); 4131 return TRC == &AArch64::FPR128RegClass || 4132 TRC == &AArch64::FPR128_loRegClass; 4133 }; 4134 return llvm::any_of(MI.operands(), IsQFPR); 4135 } 4136 4137 bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) { 4138 switch (MI.getOpcode()) { 4139 case AArch64::BRK: 4140 case AArch64::HLT: 4141 case AArch64::PACIASP: 4142 case AArch64::PACIBSP: 4143 // Implicit BTI behavior. 4144 return true; 4145 case AArch64::PAUTH_PROLOGUE: 4146 // PAUTH_PROLOGUE expands to PACI(A|B)SP. 4147 return true; 4148 case AArch64::HINT: { 4149 unsigned Imm = MI.getOperand(0).getImm(); 4150 // Explicit BTI instruction. 4151 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38) 4152 return true; 4153 // PACI(A|B)SP instructions. 4154 if (Imm == 25 || Imm == 27) 4155 return true; 4156 return false; 4157 } 4158 default: 4159 return false; 4160 } 4161 } 4162 4163 bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) { 4164 auto IsFPR = [&](const MachineOperand &Op) { 4165 if (!Op.isReg()) 4166 return false; 4167 auto Reg = Op.getReg(); 4168 if (Reg.isPhysical()) 4169 return AArch64::FPR128RegClass.contains(Reg) || 4170 AArch64::FPR64RegClass.contains(Reg) || 4171 AArch64::FPR32RegClass.contains(Reg) || 4172 AArch64::FPR16RegClass.contains(Reg) || 4173 AArch64::FPR8RegClass.contains(Reg); 4174 4175 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); 4176 return TRC == &AArch64::FPR128RegClass || 4177 TRC == &AArch64::FPR128_loRegClass || 4178 TRC == &AArch64::FPR64RegClass || 4179 TRC == &AArch64::FPR64_loRegClass || 4180 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass || 4181 TRC == &AArch64::FPR8RegClass; 4182 }; 4183 return llvm::any_of(MI.operands(), IsFPR); 4184 } 4185 4186 // Scale the unscaled offsets. Returns false if the unscaled offset can't be 4187 // scaled. 4188 static bool scaleOffset(unsigned Opc, int64_t &Offset) { 4189 int Scale = AArch64InstrInfo::getMemScale(Opc); 4190 4191 // If the byte-offset isn't a multiple of the stride, we can't scale this 4192 // offset. 
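  // For example, an LDURXi (getMemScale == 8) with a byte offset of 24 scales
  // to an element offset of 3, while a byte offset of 20 is rejected below
  // because 20 % 8 != 0.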
4193 if (Offset % Scale != 0) 4194 return false; 4195 4196 // Convert the byte-offset used by unscaled into an "element" offset used 4197 // by the scaled pair load/store instructions. 4198 Offset /= Scale; 4199 return true; 4200 } 4201 4202 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) { 4203 if (FirstOpc == SecondOpc) 4204 return true; 4205 // We can also pair sign-ext and zero-ext instructions. 4206 switch (FirstOpc) { 4207 default: 4208 return false; 4209 case AArch64::LDRQui: 4210 case AArch64::LDURQi: 4211 return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi; 4212 case AArch64::LDRWui: 4213 case AArch64::LDURWi: 4214 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi; 4215 case AArch64::LDRSWui: 4216 case AArch64::LDURSWi: 4217 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi; 4218 } 4219 // These instructions can't be paired based on their opcodes. 4220 return false; 4221 } 4222 4223 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, 4224 int64_t Offset1, unsigned Opcode1, int FI2, 4225 int64_t Offset2, unsigned Opcode2) { 4226 // Accesses through fixed stack object frame indices may access a different 4227 // fixed stack slot. Check that the object offsets + offsets match. 4228 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) { 4229 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1); 4230 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2); 4231 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered."); 4232 // Convert to scaled object offsets. 4233 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1); 4234 if (ObjectOffset1 % Scale1 != 0) 4235 return false; 4236 ObjectOffset1 /= Scale1; 4237 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2); 4238 if (ObjectOffset2 % Scale2 != 0) 4239 return false; 4240 ObjectOffset2 /= Scale2; 4241 ObjectOffset1 += Offset1; 4242 ObjectOffset2 += Offset2; 4243 return ObjectOffset1 + 1 == ObjectOffset2; 4244 } 4245 4246 return FI1 == FI2; 4247 } 4248 4249 /// Detect opportunities for ldp/stp formation. 4250 /// 4251 /// Only called for LdSt for which getMemOperandWithOffset returns true. 4252 bool AArch64InstrInfo::shouldClusterMemOps( 4253 ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1, 4254 bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2, 4255 int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize, 4256 unsigned NumBytes) const { 4257 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1); 4258 const MachineOperand &BaseOp1 = *BaseOps1.front(); 4259 const MachineOperand &BaseOp2 = *BaseOps2.front(); 4260 const MachineInstr &FirstLdSt = *BaseOp1.getParent(); 4261 const MachineInstr &SecondLdSt = *BaseOp2.getParent(); 4262 if (BaseOp1.getType() != BaseOp2.getType()) 4263 return false; 4264 4265 assert((BaseOp1.isReg() || BaseOp1.isFI()) && 4266 "Only base registers and frame indices are supported."); 4267 4268 // Check for both base regs and base FI. 4269 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg()) 4270 return false; 4271 4272 // Only cluster up to a single pair. 4273 if (ClusterSize > 2) 4274 return false; 4275 4276 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt)) 4277 return false; 4278 4279 // Can we pair these instructions based on their opcodes? 
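  // For example, canPairLdStOpc() above allows an LDRWui to pair with an
  // LDRSWui (zero- and sign-extending 32-bit loads), but rejects mixing
  // access sizes such as LDRWui with LDRXui.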
4280 unsigned FirstOpc = FirstLdSt.getOpcode(); 4281 unsigned SecondOpc = SecondLdSt.getOpcode(); 4282 if (!canPairLdStOpc(FirstOpc, SecondOpc)) 4283 return false; 4284 4285 // Can't merge volatiles or load/stores that have a hint to avoid pair 4286 // formation, for example. 4287 if (!isCandidateToMergeOrPair(FirstLdSt) || 4288 !isCandidateToMergeOrPair(SecondLdSt)) 4289 return false; 4290 4291 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate. 4292 int64_t Offset1 = FirstLdSt.getOperand(2).getImm(); 4293 if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1)) 4294 return false; 4295 4296 int64_t Offset2 = SecondLdSt.getOperand(2).getImm(); 4297 if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2)) 4298 return false; 4299 4300 // Pairwise instructions have a 7-bit signed offset field. 4301 if (Offset1 > 63 || Offset1 < -64) 4302 return false; 4303 4304 // The caller should already have ordered First/SecondLdSt by offset. 4305 // Note: except for non-equal frame index bases 4306 if (BaseOp1.isFI()) { 4307 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) && 4308 "Caller should have ordered offsets."); 4309 4310 const MachineFrameInfo &MFI = 4311 FirstLdSt.getParent()->getParent()->getFrameInfo(); 4312 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc, 4313 BaseOp2.getIndex(), Offset2, SecondOpc); 4314 } 4315 4316 assert(Offset1 <= Offset2 && "Caller should have ordered offsets."); 4317 4318 return Offset1 + 1 == Offset2; 4319 } 4320 4321 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, 4322 unsigned Reg, unsigned SubIdx, 4323 unsigned State, 4324 const TargetRegisterInfo *TRI) { 4325 if (!SubIdx) 4326 return MIB.addReg(Reg, State); 4327 4328 if (Register::isPhysicalRegister(Reg)) 4329 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); 4330 return MIB.addReg(Reg, State, SubIdx); 4331 } 4332 4333 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, 4334 unsigned NumRegs) { 4335 // We really want the positive remainder mod 32 here, that happens to be 4336 // easily obtainable with a mask. 
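  // Worked example: with DestEncoding == 1, SrcEncoding == 0 and NumRegs == 3,
  // (1 - 0) & 0x1f == 1 < 3, so a forward sub-register copy would clobber part
  // of the source tuple and copyPhysRegTuple() below must copy in reverse.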
4337 return ((DestReg - SrcReg) & 0x1f) < NumRegs; 4338 } 4339 4340 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB, 4341 MachineBasicBlock::iterator I, 4342 const DebugLoc &DL, MCRegister DestReg, 4343 MCRegister SrcReg, bool KillSrc, 4344 unsigned Opcode, 4345 ArrayRef<unsigned> Indices) const { 4346 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON"); 4347 const TargetRegisterInfo *TRI = &getRegisterInfo(); 4348 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 4349 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 4350 unsigned NumRegs = Indices.size(); 4351 4352 int SubReg = 0, End = NumRegs, Incr = 1; 4353 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) { 4354 SubReg = NumRegs - 1; 4355 End = -1; 4356 Incr = -1; 4357 } 4358 4359 for (; SubReg != End; SubReg += Incr) { 4360 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 4361 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 4362 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); 4363 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 4364 } 4365 } 4366 4367 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB, 4368 MachineBasicBlock::iterator I, 4369 DebugLoc DL, unsigned DestReg, 4370 unsigned SrcReg, bool KillSrc, 4371 unsigned Opcode, unsigned ZeroReg, 4372 llvm::ArrayRef<unsigned> Indices) const { 4373 const TargetRegisterInfo *TRI = &getRegisterInfo(); 4374 unsigned NumRegs = Indices.size(); 4375 4376 #ifndef NDEBUG 4377 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 4378 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 4379 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 && 4380 "GPR reg sequences should not be able to overlap"); 4381 #endif 4382 4383 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) { 4384 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 4385 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 4386 MIB.addReg(ZeroReg); 4387 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 4388 MIB.addImm(0); 4389 } 4390 } 4391 4392 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, 4393 MachineBasicBlock::iterator I, 4394 const DebugLoc &DL, MCRegister DestReg, 4395 MCRegister SrcReg, bool KillSrc) const { 4396 if (AArch64::GPR32spRegClass.contains(DestReg) && 4397 (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { 4398 const TargetRegisterInfo *TRI = &getRegisterInfo(); 4399 4400 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) { 4401 // If either operand is WSP, expand to ADD #0. 4402 if (Subtarget.hasZeroCycleRegMove()) { 4403 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. 4404 MCRegister DestRegX = TRI->getMatchingSuperReg( 4405 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 4406 MCRegister SrcRegX = TRI->getMatchingSuperReg( 4407 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 4408 // This instruction is reading and writing X registers. This may upset 4409 // the register scavenger and machine verifier, so we need to indicate 4410 // that we are reading an undefined value from SrcRegX, but a proper 4411 // value from SrcReg. 
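        // For example, copying WSP into W0 on such a subtarget is emitted as
        // "ADD X0, SP, #0" (the 64-bit form of the registers), with WSP kept
        // as an implicit use so liveness information stays correct.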
4412 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX) 4413 .addReg(SrcRegX, RegState::Undef) 4414 .addImm(0) 4415 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) 4416 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 4417 } else { 4418 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg) 4419 .addReg(SrcReg, getKillRegState(KillSrc)) 4420 .addImm(0) 4421 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 4422 } 4423 } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) { 4424 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg) 4425 .addImm(0) 4426 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 4427 } else { 4428 if (Subtarget.hasZeroCycleRegMove()) { 4429 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. 4430 MCRegister DestRegX = TRI->getMatchingSuperReg( 4431 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 4432 MCRegister SrcRegX = TRI->getMatchingSuperReg( 4433 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 4434 // This instruction is reading and writing X registers. This may upset 4435 // the register scavenger and machine verifier, so we need to indicate 4436 // that we are reading an undefined value from SrcRegX, but a proper 4437 // value from SrcReg. 4438 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX) 4439 .addReg(AArch64::XZR) 4440 .addReg(SrcRegX, RegState::Undef) 4441 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 4442 } else { 4443 // Otherwise, expand to ORR WZR. 4444 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg) 4445 .addReg(AArch64::WZR) 4446 .addReg(SrcReg, getKillRegState(KillSrc)); 4447 } 4448 } 4449 return; 4450 } 4451 4452 // Copy a Predicate register by ORRing with itself. 4453 if (AArch64::PPRRegClass.contains(DestReg) && 4454 AArch64::PPRRegClass.contains(SrcReg)) { 4455 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 4456 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg) 4457 .addReg(SrcReg) // Pg 4458 .addReg(SrcReg) 4459 .addReg(SrcReg, getKillRegState(KillSrc)); 4460 return; 4461 } 4462 4463 // Copy a predicate-as-counter register by ORRing with itself as if it 4464 // were a regular predicate (mask) register. 4465 bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg); 4466 bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg); 4467 if (DestIsPNR || SrcIsPNR) { 4468 assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && 4469 "Unexpected predicate-as-counter register."); 4470 auto ToPPR = [](MCRegister R) -> MCRegister { 4471 return (R - AArch64::PN0) + AArch64::P0; 4472 }; 4473 MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg; 4474 MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg; 4475 4476 if (PPRSrcReg != PPRDestReg) { 4477 auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg) 4478 .addReg(PPRSrcReg) // Pg 4479 .addReg(PPRSrcReg) 4480 .addReg(PPRSrcReg, getKillRegState(KillSrc)); 4481 if (DestIsPNR) 4482 NewMI.addDef(DestReg, RegState::Implicit); 4483 } 4484 return; 4485 } 4486 4487 // Copy a Z register by ORRing with itself. 4488 if (AArch64::ZPRRegClass.contains(DestReg) && 4489 AArch64::ZPRRegClass.contains(SrcReg)) { 4490 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 4491 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg) 4492 .addReg(SrcReg) 4493 .addReg(SrcReg, getKillRegState(KillSrc)); 4494 return; 4495 } 4496 4497 // Copy a Z register pair by copying the individual sub-registers. 
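  // For example, a Z0_Z1 -> Z2_Z3 copy is expanded via copyPhysRegTuple() into
  // two ORR_ZZZ instructions, one for each of the zsub0/zsub1 sub-registers.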
4498 if ((AArch64::ZPR2RegClass.contains(DestReg) || 4499 AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) && 4500 (AArch64::ZPR2RegClass.contains(SrcReg) || 4501 AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) { 4502 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 4503 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1}; 4504 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 4505 Indices); 4506 return; 4507 } 4508 4509 // Copy a Z register triple by copying the individual sub-registers. 4510 if (AArch64::ZPR3RegClass.contains(DestReg) && 4511 AArch64::ZPR3RegClass.contains(SrcReg)) { 4512 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 4513 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, 4514 AArch64::zsub2}; 4515 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 4516 Indices); 4517 return; 4518 } 4519 4520 // Copy a Z register quad by copying the individual sub-registers. 4521 if ((AArch64::ZPR4RegClass.contains(DestReg) || 4522 AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) && 4523 (AArch64::ZPR4RegClass.contains(SrcReg) || 4524 AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) { 4525 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 4526 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, 4527 AArch64::zsub2, AArch64::zsub3}; 4528 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 4529 Indices); 4530 return; 4531 } 4532 4533 if (AArch64::GPR64spRegClass.contains(DestReg) && 4534 (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { 4535 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { 4536 // If either operand is SP, expand to ADD #0. 4537 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg) 4538 .addReg(SrcReg, getKillRegState(KillSrc)) 4539 .addImm(0) 4540 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 4541 } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) { 4542 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg) 4543 .addImm(0) 4544 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 4545 } else { 4546 // Otherwise, expand to ORR XZR. 4547 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg) 4548 .addReg(AArch64::XZR) 4549 .addReg(SrcReg, getKillRegState(KillSrc)); 4550 } 4551 return; 4552 } 4553 4554 // Copy a DDDD register quad by copying the individual sub-registers. 4555 if (AArch64::DDDDRegClass.contains(DestReg) && 4556 AArch64::DDDDRegClass.contains(SrcReg)) { 4557 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 4558 AArch64::dsub2, AArch64::dsub3}; 4559 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 4560 Indices); 4561 return; 4562 } 4563 4564 // Copy a DDD register triple by copying the individual sub-registers. 4565 if (AArch64::DDDRegClass.contains(DestReg) && 4566 AArch64::DDDRegClass.contains(SrcReg)) { 4567 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 4568 AArch64::dsub2}; 4569 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 4570 Indices); 4571 return; 4572 } 4573 4574 // Copy a DD register pair by copying the individual sub-registers. 
4575 if (AArch64::DDRegClass.contains(DestReg) && 4576 AArch64::DDRegClass.contains(SrcReg)) { 4577 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1}; 4578 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 4579 Indices); 4580 return; 4581 } 4582 4583 // Copy a QQQQ register quad by copying the individual sub-registers. 4584 if (AArch64::QQQQRegClass.contains(DestReg) && 4585 AArch64::QQQQRegClass.contains(SrcReg)) { 4586 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 4587 AArch64::qsub2, AArch64::qsub3}; 4588 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 4589 Indices); 4590 return; 4591 } 4592 4593 // Copy a QQQ register triple by copying the individual sub-registers. 4594 if (AArch64::QQQRegClass.contains(DestReg) && 4595 AArch64::QQQRegClass.contains(SrcReg)) { 4596 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 4597 AArch64::qsub2}; 4598 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 4599 Indices); 4600 return; 4601 } 4602 4603 // Copy a QQ register pair by copying the individual sub-registers. 4604 if (AArch64::QQRegClass.contains(DestReg) && 4605 AArch64::QQRegClass.contains(SrcReg)) { 4606 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1}; 4607 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 4608 Indices); 4609 return; 4610 } 4611 4612 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) && 4613 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) { 4614 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64}; 4615 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs, 4616 AArch64::XZR, Indices); 4617 return; 4618 } 4619 4620 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) && 4621 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) { 4622 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32}; 4623 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs, 4624 AArch64::WZR, Indices); 4625 return; 4626 } 4627 4628 if (AArch64::FPR128RegClass.contains(DestReg) && 4629 AArch64::FPR128RegClass.contains(SrcReg)) { 4630 if (Subtarget.hasSVEorSME() && !Subtarget.isNeonAvailable()) 4631 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ)) 4632 .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define) 4633 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0)) 4634 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0)); 4635 else if (Subtarget.hasNEON()) 4636 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 4637 .addReg(SrcReg) 4638 .addReg(SrcReg, getKillRegState(KillSrc)); 4639 else { 4640 BuildMI(MBB, I, DL, get(AArch64::STRQpre)) 4641 .addReg(AArch64::SP, RegState::Define) 4642 .addReg(SrcReg, getKillRegState(KillSrc)) 4643 .addReg(AArch64::SP) 4644 .addImm(-16); 4645 BuildMI(MBB, I, DL, get(AArch64::LDRQpre)) 4646 .addReg(AArch64::SP, RegState::Define) 4647 .addReg(DestReg, RegState::Define) 4648 .addReg(AArch64::SP) 4649 .addImm(16); 4650 } 4651 return; 4652 } 4653 4654 if (AArch64::FPR64RegClass.contains(DestReg) && 4655 AArch64::FPR64RegClass.contains(SrcReg)) { 4656 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg) 4657 .addReg(SrcReg, getKillRegState(KillSrc)); 4658 return; 4659 } 4660 4661 if (AArch64::FPR32RegClass.contains(DestReg) && 4662 AArch64::FPR32RegClass.contains(SrcReg)) { 4663 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 4664 .addReg(SrcReg, getKillRegState(KillSrc)); 4665 return; 4666 } 4667 4668 if (AArch64::FPR16RegClass.contains(DestReg) && 
4669 AArch64::FPR16RegClass.contains(SrcReg)) { 4670 DestReg = 4671 RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass); 4672 SrcReg = 4673 RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass); 4674 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 4675 .addReg(SrcReg, getKillRegState(KillSrc)); 4676 return; 4677 } 4678 4679 if (AArch64::FPR8RegClass.contains(DestReg) && 4680 AArch64::FPR8RegClass.contains(SrcReg)) { 4681 DestReg = 4682 RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass); 4683 SrcReg = 4684 RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass); 4685 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 4686 .addReg(SrcReg, getKillRegState(KillSrc)); 4687 return; 4688 } 4689 4690 // Copies between GPR64 and FPR64. 4691 if (AArch64::FPR64RegClass.contains(DestReg) && 4692 AArch64::GPR64RegClass.contains(SrcReg)) { 4693 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) 4694 .addReg(SrcReg, getKillRegState(KillSrc)); 4695 return; 4696 } 4697 if (AArch64::GPR64RegClass.contains(DestReg) && 4698 AArch64::FPR64RegClass.contains(SrcReg)) { 4699 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg) 4700 .addReg(SrcReg, getKillRegState(KillSrc)); 4701 return; 4702 } 4703 // Copies between GPR32 and FPR32. 4704 if (AArch64::FPR32RegClass.contains(DestReg) && 4705 AArch64::GPR32RegClass.contains(SrcReg)) { 4706 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) 4707 .addReg(SrcReg, getKillRegState(KillSrc)); 4708 return; 4709 } 4710 if (AArch64::GPR32RegClass.contains(DestReg) && 4711 AArch64::FPR32RegClass.contains(SrcReg)) { 4712 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg) 4713 .addReg(SrcReg, getKillRegState(KillSrc)); 4714 return; 4715 } 4716 4717 if (DestReg == AArch64::NZCV) { 4718 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy"); 4719 BuildMI(MBB, I, DL, get(AArch64::MSR)) 4720 .addImm(AArch64SysReg::NZCV) 4721 .addReg(SrcReg, getKillRegState(KillSrc)) 4722 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define); 4723 return; 4724 } 4725 4726 if (SrcReg == AArch64::NZCV) { 4727 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy"); 4728 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg) 4729 .addImm(AArch64SysReg::NZCV) 4730 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); 4731 return; 4732 } 4733 4734 #ifndef NDEBUG 4735 const TargetRegisterInfo &TRI = getRegisterInfo(); 4736 errs() << TRI.getRegAsmName(DestReg) << " = COPY " 4737 << TRI.getRegAsmName(SrcReg) << "\n"; 4738 #endif 4739 llvm_unreachable("unimplemented reg-to-reg copy"); 4740 } 4741 4742 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, 4743 MachineBasicBlock &MBB, 4744 MachineBasicBlock::iterator InsertBefore, 4745 const MCInstrDesc &MCID, 4746 Register SrcReg, bool IsKill, 4747 unsigned SubIdx0, unsigned SubIdx1, int FI, 4748 MachineMemOperand *MMO) { 4749 Register SrcReg0 = SrcReg; 4750 Register SrcReg1 = SrcReg; 4751 if (SrcReg.isPhysical()) { 4752 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0); 4753 SubIdx0 = 0; 4754 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1); 4755 SubIdx1 = 0; 4756 } 4757 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 4758 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0) 4759 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1) 4760 .addFrameIndex(FI) 4761 .addImm(0) 4762 .addMemOperand(MMO); 4763 } 4764 4765 void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, 4766 MachineBasicBlock::iterator MBBI, 4767 Register SrcReg, bool 
isKill, int FI, 4768 const TargetRegisterClass *RC, 4769 const TargetRegisterInfo *TRI, 4770 Register VReg) const { 4771 MachineFunction &MF = *MBB.getParent(); 4772 MachineFrameInfo &MFI = MF.getFrameInfo(); 4773 4774 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 4775 MachineMemOperand *MMO = 4776 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, 4777 MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); 4778 unsigned Opc = 0; 4779 bool Offset = true; 4780 MCRegister PNRReg = MCRegister::NoRegister; 4781 unsigned StackID = TargetStackID::Default; 4782 switch (TRI->getSpillSize(*RC)) { 4783 case 1: 4784 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 4785 Opc = AArch64::STRBui; 4786 break; 4787 case 2: 4788 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 4789 Opc = AArch64::STRHui; 4790 else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { 4791 assert(Subtarget.hasSVEorSME() && 4792 "Unexpected register store without SVE store instructions"); 4793 Opc = AArch64::STR_PXI; 4794 StackID = TargetStackID::ScalableVector; 4795 } else if (AArch64::PNRRegClass.hasSubClassEq(RC)) { 4796 assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && 4797 "Unexpected register store without SVE2p1 or SME2"); 4798 if (SrcReg.isVirtual()) { 4799 auto NewSrcReg = 4800 MF.getRegInfo().createVirtualRegister(&AArch64::PPRRegClass); 4801 BuildMI(MBB, MBBI, DebugLoc(), get(TargetOpcode::COPY), NewSrcReg) 4802 .addReg(SrcReg); 4803 SrcReg = NewSrcReg; 4804 } else 4805 SrcReg = (SrcReg - AArch64::PN0) + AArch64::P0; 4806 Opc = AArch64::STR_PXI; 4807 StackID = TargetStackID::ScalableVector; 4808 } 4809 break; 4810 case 4: 4811 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 4812 Opc = AArch64::STRWui; 4813 if (SrcReg.isVirtual()) 4814 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass); 4815 else 4816 assert(SrcReg != AArch64::WSP); 4817 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 4818 Opc = AArch64::STRSui; 4819 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) { 4820 Opc = AArch64::STR_PPXI; 4821 StackID = TargetStackID::ScalableVector; 4822 } 4823 break; 4824 case 8: 4825 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 4826 Opc = AArch64::STRXui; 4827 if (SrcReg.isVirtual()) 4828 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 4829 else 4830 assert(SrcReg != AArch64::SP); 4831 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 4832 Opc = AArch64::STRDui; 4833 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 4834 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 4835 get(AArch64::STPWi), SrcReg, isKill, 4836 AArch64::sube32, AArch64::subo32, FI, MMO); 4837 return; 4838 } 4839 break; 4840 case 16: 4841 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 4842 Opc = AArch64::STRQui; 4843 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 4844 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 4845 Opc = AArch64::ST1Twov1d; 4846 Offset = false; 4847 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 4848 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 4849 get(AArch64::STPXi), SrcReg, isKill, 4850 AArch64::sube64, AArch64::subo64, FI, MMO); 4851 return; 4852 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 4853 assert(Subtarget.hasSVEorSME() && 4854 "Unexpected register store without SVE store instructions"); 4855 Opc = AArch64::STR_ZXI; 4856 StackID = TargetStackID::ScalableVector; 4857 } 4858 break; 4859 case 24: 4860 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 4861 
assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 4862 Opc = AArch64::ST1Threev1d; 4863 Offset = false; 4864 } 4865 break; 4866 case 32: 4867 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 4868 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 4869 Opc = AArch64::ST1Fourv1d; 4870 Offset = false; 4871 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 4872 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 4873 Opc = AArch64::ST1Twov2d; 4874 Offset = false; 4875 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) || 4876 AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { 4877 assert(Subtarget.hasSVEorSME() && 4878 "Unexpected register store without SVE store instructions"); 4879 Opc = AArch64::STR_ZZXI; 4880 StackID = TargetStackID::ScalableVector; 4881 } 4882 break; 4883 case 48: 4884 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 4885 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 4886 Opc = AArch64::ST1Threev2d; 4887 Offset = false; 4888 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { 4889 assert(Subtarget.hasSVEorSME() && 4890 "Unexpected register store without SVE store instructions"); 4891 Opc = AArch64::STR_ZZZXI; 4892 StackID = TargetStackID::ScalableVector; 4893 } 4894 break; 4895 case 64: 4896 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 4897 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 4898 Opc = AArch64::ST1Fourv2d; 4899 Offset = false; 4900 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) || 4901 AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { 4902 assert(Subtarget.hasSVEorSME() && 4903 "Unexpected register store without SVE store instructions"); 4904 Opc = AArch64::STR_ZZZZXI; 4905 StackID = TargetStackID::ScalableVector; 4906 } 4907 break; 4908 } 4909 assert(Opc && "Unknown register class"); 4910 MFI.setStackID(FI, StackID); 4911 4912 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 4913 .addReg(SrcReg, getKillRegState(isKill)) 4914 .addFrameIndex(FI); 4915 4916 if (Offset) 4917 MI.addImm(0); 4918 if (PNRReg.isValid()) 4919 MI.addDef(PNRReg, RegState::Implicit); 4920 MI.addMemOperand(MMO); 4921 } 4922 4923 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, 4924 MachineBasicBlock &MBB, 4925 MachineBasicBlock::iterator InsertBefore, 4926 const MCInstrDesc &MCID, 4927 Register DestReg, unsigned SubIdx0, 4928 unsigned SubIdx1, int FI, 4929 MachineMemOperand *MMO) { 4930 Register DestReg0 = DestReg; 4931 Register DestReg1 = DestReg; 4932 bool IsUndef = true; 4933 if (DestReg.isPhysical()) { 4934 DestReg0 = TRI.getSubReg(DestReg, SubIdx0); 4935 SubIdx0 = 0; 4936 DestReg1 = TRI.getSubReg(DestReg, SubIdx1); 4937 SubIdx1 = 0; 4938 IsUndef = false; 4939 } 4940 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 4941 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0) 4942 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1) 4943 .addFrameIndex(FI) 4944 .addImm(0) 4945 .addMemOperand(MMO); 4946 } 4947 4948 void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, 4949 MachineBasicBlock::iterator MBBI, 4950 Register DestReg, int FI, 4951 const TargetRegisterClass *RC, 4952 const TargetRegisterInfo *TRI, 4953 Register VReg) const { 4954 MachineFunction &MF = *MBB.getParent(); 4955 MachineFrameInfo &MFI = MF.getFrameInfo(); 4956 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 4957 MachineMemOperand *MMO = 4958 
MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, 4959 MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); 4960 4961 unsigned Opc = 0; 4962 bool Offset = true; 4963 unsigned StackID = TargetStackID::Default; 4964 Register PNRReg = MCRegister::NoRegister; 4965 switch (TRI->getSpillSize(*RC)) { 4966 case 1: 4967 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 4968 Opc = AArch64::LDRBui; 4969 break; 4970 case 2: 4971 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 4972 Opc = AArch64::LDRHui; 4973 else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { 4974 assert(Subtarget.hasSVEorSME() && 4975 "Unexpected register load without SVE load instructions"); 4976 Opc = AArch64::LDR_PXI; 4977 StackID = TargetStackID::ScalableVector; 4978 } else if (AArch64::PNRRegClass.hasSubClassEq(RC)) { 4979 assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) && 4980 "Unexpected register load without SVE2p1 or SME2"); 4981 PNRReg = DestReg; 4982 if (DestReg.isVirtual()) 4983 DestReg = MF.getRegInfo().createVirtualRegister(&AArch64::PPRRegClass); 4984 else 4985 DestReg = (DestReg - AArch64::PN0) + AArch64::P0; 4986 Opc = AArch64::LDR_PXI; 4987 StackID = TargetStackID::ScalableVector; 4988 } 4989 break; 4990 case 4: 4991 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 4992 Opc = AArch64::LDRWui; 4993 if (DestReg.isVirtual()) 4994 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass); 4995 else 4996 assert(DestReg != AArch64::WSP); 4997 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 4998 Opc = AArch64::LDRSui; 4999 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) { 5000 Opc = AArch64::LDR_PPXI; 5001 StackID = TargetStackID::ScalableVector; 5002 } 5003 break; 5004 case 8: 5005 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 5006 Opc = AArch64::LDRXui; 5007 if (DestReg.isVirtual()) 5008 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass); 5009 else 5010 assert(DestReg != AArch64::SP); 5011 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 5012 Opc = AArch64::LDRDui; 5013 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 5014 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 5015 get(AArch64::LDPWi), DestReg, AArch64::sube32, 5016 AArch64::subo32, FI, MMO); 5017 return; 5018 } 5019 break; 5020 case 16: 5021 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 5022 Opc = AArch64::LDRQui; 5023 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 5024 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 5025 Opc = AArch64::LD1Twov1d; 5026 Offset = false; 5027 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 5028 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 5029 get(AArch64::LDPXi), DestReg, AArch64::sube64, 5030 AArch64::subo64, FI, MMO); 5031 return; 5032 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 5033 assert(Subtarget.hasSVEorSME() && 5034 "Unexpected register load without SVE load instructions"); 5035 Opc = AArch64::LDR_ZXI; 5036 StackID = TargetStackID::ScalableVector; 5037 } 5038 break; 5039 case 24: 5040 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 5041 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 5042 Opc = AArch64::LD1Threev1d; 5043 Offset = false; 5044 } 5045 break; 5046 case 32: 5047 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 5048 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 5049 Opc = AArch64::LD1Fourv1d; 5050 Offset = false; 5051 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 5052 assert(Subtarget.hasNEON() && "Unexpected register load 
without NEON"); 5053 Opc = AArch64::LD1Twov2d; 5054 Offset = false; 5055 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) || 5056 AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { 5057 assert(Subtarget.hasSVEorSME() && 5058 "Unexpected register load without SVE load instructions"); 5059 Opc = AArch64::LDR_ZZXI; 5060 StackID = TargetStackID::ScalableVector; 5061 } 5062 break; 5063 case 48: 5064 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 5065 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 5066 Opc = AArch64::LD1Threev2d; 5067 Offset = false; 5068 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { 5069 assert(Subtarget.hasSVEorSME() && 5070 "Unexpected register load without SVE load instructions"); 5071 Opc = AArch64::LDR_ZZZXI; 5072 StackID = TargetStackID::ScalableVector; 5073 } 5074 break; 5075 case 64: 5076 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 5077 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 5078 Opc = AArch64::LD1Fourv2d; 5079 Offset = false; 5080 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) || 5081 AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { 5082 assert(Subtarget.hasSVEorSME() && 5083 "Unexpected register load without SVE load instructions"); 5084 Opc = AArch64::LDR_ZZZZXI; 5085 StackID = TargetStackID::ScalableVector; 5086 } 5087 break; 5088 } 5089 5090 assert(Opc && "Unknown register class"); 5091 MFI.setStackID(FI, StackID); 5092 5093 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 5094 .addReg(DestReg, getDefRegState(true)) 5095 .addFrameIndex(FI); 5096 if (Offset) 5097 MI.addImm(0); 5098 if (PNRReg.isValid() && !PNRReg.isVirtual()) 5099 MI.addDef(PNRReg, RegState::Implicit); 5100 MI.addMemOperand(MMO); 5101 5102 if (PNRReg.isValid() && PNRReg.isVirtual()) 5103 BuildMI(MBB, MBBI, DebugLoc(), get(TargetOpcode::COPY), PNRReg) 5104 .addReg(DestReg); 5105 } 5106 5107 bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, 5108 const MachineInstr &UseMI, 5109 const TargetRegisterInfo *TRI) { 5110 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()), 5111 UseMI.getIterator()), 5112 [TRI](const MachineInstr &I) { 5113 return I.modifiesRegister(AArch64::NZCV, TRI) || 5114 I.readsRegister(AArch64::NZCV, TRI); 5115 }); 5116 } 5117 5118 void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets( 5119 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) { 5120 // The smallest scalable element supported by scaled SVE addressing 5121 // modes are predicates, which are 2 scalable bytes in size. So the scalable 5122 // byte offset must always be a multiple of 2. 5123 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset"); 5124 5125 // VGSized offsets are divided by '2', because the VG register is the 5126 // the number of 64bit granules as opposed to 128bit vector chunks, 5127 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled. 5128 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes. 5129 // VG = n * 2 and the dwarf offset must be VG * 8 bytes. 5130 ByteSized = Offset.getFixed(); 5131 VGSized = Offset.getScalable() / 2; 5132 } 5133 5134 /// Returns the offset in parts to which this frame offset can be 5135 /// decomposed for the purpose of describing a frame offset. 5136 /// For non-scalable offsets this is simply its byte size. 
5137 void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( 5138 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors, 5139 int64_t &NumDataVectors) { 5140 // The smallest scalable element supported by scaled SVE addressing 5141 // modes are predicates, which are 2 scalable bytes in size. So the scalable 5142 // byte offset must always be a multiple of 2. 5143 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset"); 5144 5145 NumBytes = Offset.getFixed(); 5146 NumDataVectors = 0; 5147 NumPredicateVectors = Offset.getScalable() / 2; 5148 // This method is used to get the offsets to adjust the frame offset. 5149 // If the function requires ADDPL to be used and needs more than two ADDPL 5150 // instructions, part of the offset is folded into NumDataVectors so that it 5151 // uses ADDVL for part of it, reducing the number of ADDPL instructions. 5152 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 || 5153 NumPredicateVectors > 62) { 5154 NumDataVectors = NumPredicateVectors / 8; 5155 NumPredicateVectors -= NumDataVectors * 8; 5156 } 5157 } 5158 5159 // Convenience function to create a DWARF expression for 5160 // Expr + NumBytes + NumVGScaledBytes * AArch64::VG 5161 static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes, 5162 int NumVGScaledBytes, unsigned VG, 5163 llvm::raw_string_ostream &Comment) { 5164 uint8_t buffer[16]; 5165 5166 if (NumBytes) { 5167 Expr.push_back(dwarf::DW_OP_consts); 5168 Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer)); 5169 Expr.push_back((uint8_t)dwarf::DW_OP_plus); 5170 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes); 5171 } 5172 5173 if (NumVGScaledBytes) { 5174 Expr.push_back((uint8_t)dwarf::DW_OP_consts); 5175 Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer)); 5176 5177 Expr.push_back((uint8_t)dwarf::DW_OP_bregx); 5178 Expr.append(buffer, buffer + encodeULEB128(VG, buffer)); 5179 Expr.push_back(0); 5180 5181 Expr.push_back((uint8_t)dwarf::DW_OP_mul); 5182 Expr.push_back((uint8_t)dwarf::DW_OP_plus); 5183 5184 Comment << (NumVGScaledBytes < 0 ? " - " : " + ") 5185 << std::abs(NumVGScaledBytes) << " * VG"; 5186 } 5187 } 5188 5189 // Creates an MCCFIInstruction: 5190 // { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr } 5191 static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, 5192 unsigned Reg, 5193 const StackOffset &Offset) { 5194 int64_t NumBytes, NumVGScaledBytes; 5195 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes, 5196 NumVGScaledBytes); 5197 std::string CommentBuffer; 5198 llvm::raw_string_ostream Comment(CommentBuffer); 5199 5200 if (Reg == AArch64::SP) 5201 Comment << "sp"; 5202 else if (Reg == AArch64::FP) 5203 Comment << "fp"; 5204 else 5205 Comment << printReg(Reg, &TRI); 5206 5207 // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG) 5208 SmallString<64> Expr; 5209 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); 5210 Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg)); 5211 Expr.push_back(0); 5212 appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes, 5213 TRI.getDwarfRegNum(AArch64::VG, true), Comment); 5214 5215 // Wrap this into DW_CFA_def_cfa. 
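  // The escape blob emitted below is: DW_CFA_def_cfa_expression, a ULEB128
  // length, and then the breg/consts/bregx/mul/plus expression built above.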
5216 SmallString<64> DefCfaExpr; 5217 DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression); 5218 uint8_t buffer[16]; 5219 DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer)); 5220 DefCfaExpr.append(Expr.str()); 5221 return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(), 5222 Comment.str()); 5223 } 5224 5225 MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI, 5226 unsigned FrameReg, unsigned Reg, 5227 const StackOffset &Offset, 5228 bool LastAdjustmentWasScalable) { 5229 if (Offset.getScalable()) 5230 return createDefCFAExpression(TRI, Reg, Offset); 5231 5232 if (FrameReg == Reg && !LastAdjustmentWasScalable) 5233 return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed())); 5234 5235 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); 5236 return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed()); 5237 } 5238 5239 MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI, 5240 unsigned Reg, 5241 const StackOffset &OffsetFromDefCFA) { 5242 int64_t NumBytes, NumVGScaledBytes; 5243 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets( 5244 OffsetFromDefCFA, NumBytes, NumVGScaledBytes); 5245 5246 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); 5247 5248 // Non-scalable offsets can use DW_CFA_offset directly. 5249 if (!NumVGScaledBytes) 5250 return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes); 5251 5252 std::string CommentBuffer; 5253 llvm::raw_string_ostream Comment(CommentBuffer); 5254 Comment << printReg(Reg, &TRI) << " @ cfa"; 5255 5256 // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG) 5257 SmallString<64> OffsetExpr; 5258 appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes, 5259 TRI.getDwarfRegNum(AArch64::VG, true), Comment); 5260 5261 // Wrap this into DW_CFA_expression 5262 SmallString<64> CfaExpr; 5263 CfaExpr.push_back(dwarf::DW_CFA_expression); 5264 uint8_t buffer[16]; 5265 CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer)); 5266 CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer)); 5267 CfaExpr.append(OffsetExpr.str()); 5268 5269 return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(), 5270 Comment.str()); 5271 } 5272 5273 // Helper function to emit a frame offset adjustment from a given 5274 // pointer (SrcReg), stored into DestReg. This function is explicit 5275 // in that it requires the opcode. 5276 static void emitFrameOffsetAdj(MachineBasicBlock &MBB, 5277 MachineBasicBlock::iterator MBBI, 5278 const DebugLoc &DL, unsigned DestReg, 5279 unsigned SrcReg, int64_t Offset, unsigned Opc, 5280 const TargetInstrInfo *TII, 5281 MachineInstr::MIFlag Flag, bool NeedsWinCFI, 5282 bool *HasWinCFI, bool EmitCFAOffset, 5283 StackOffset CFAOffset, unsigned FrameReg) { 5284 int Sign = 1; 5285 unsigned MaxEncoding, ShiftSize; 5286 switch (Opc) { 5287 case AArch64::ADDXri: 5288 case AArch64::ADDSXri: 5289 case AArch64::SUBXri: 5290 case AArch64::SUBSXri: 5291 MaxEncoding = 0xfff; 5292 ShiftSize = 12; 5293 break; 5294 case AArch64::ADDVL_XXI: 5295 case AArch64::ADDPL_XXI: 5296 case AArch64::ADDSVL_XXI: 5297 case AArch64::ADDSPL_XXI: 5298 MaxEncoding = 31; 5299 ShiftSize = 0; 5300 if (Offset < 0) { 5301 MaxEncoding = 32; 5302 Sign = -1; 5303 Offset = -Offset; 5304 } 5305 break; 5306 default: 5307 llvm_unreachable("Unsupported opcode"); 5308 } 5309 5310 // `Offset` can be in bytes or in "scalable bytes". 
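  // ADDVL/ADDSVL advance in units of a whole SVE vector (16 scalable bytes per
  // unit) and ADDPL/ADDSPL in units of a predicate register (2 scalable bytes
  // per unit); the VScale factor below accounts for this when the CFA offset
  // is updated.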
5311 int VScale = 1; 5312 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI) 5313 VScale = 16; 5314 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI) 5315 VScale = 2; 5316 5317 // FIXME: If the offset won't fit in 24-bits, compute the offset into a 5318 // scratch register. If DestReg is a virtual register, use it as the 5319 // scratch register; otherwise, create a new virtual register (to be 5320 // replaced by the scavenger at the end of PEI). That case can be optimized 5321 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch 5322 // register can be loaded with offset%8 and the add/sub can use an extending 5323 // instruction with LSL#3. 5324 // Currently the function handles any offsets but generates a poor sequence 5325 // of code. 5326 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); 5327 5328 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; 5329 Register TmpReg = DestReg; 5330 if (TmpReg == AArch64::XZR) 5331 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister( 5332 &AArch64::GPR64RegClass); 5333 do { 5334 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue); 5335 unsigned LocalShiftSize = 0; 5336 if (ThisVal > MaxEncoding) { 5337 ThisVal = ThisVal >> ShiftSize; 5338 LocalShiftSize = ShiftSize; 5339 } 5340 assert((ThisVal >> ShiftSize) <= MaxEncoding && 5341 "Encoding cannot handle value that big"); 5342 5343 Offset -= ThisVal << LocalShiftSize; 5344 if (Offset == 0) 5345 TmpReg = DestReg; 5346 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg) 5347 .addReg(SrcReg) 5348 .addImm(Sign * (int)ThisVal); 5349 if (ShiftSize) 5350 MBI = MBI.addImm( 5351 AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize)); 5352 MBI = MBI.setMIFlag(Flag); 5353 5354 auto Change = 5355 VScale == 1 5356 ? 
StackOffset::getFixed(ThisVal << LocalShiftSize) 5357 : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize)); 5358 if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri) 5359 CFAOffset += Change; 5360 else 5361 CFAOffset -= Change; 5362 if (EmitCFAOffset && DestReg == TmpReg) { 5363 MachineFunction &MF = *MBB.getParent(); 5364 const TargetSubtargetInfo &STI = MF.getSubtarget(); 5365 const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); 5366 5367 unsigned CFIIndex = MF.addFrameInst( 5368 createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1)); 5369 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) 5370 .addCFIIndex(CFIIndex) 5371 .setMIFlags(Flag); 5372 } 5373 5374 if (NeedsWinCFI) { 5375 assert(Sign == 1 && "SEH directives should always have a positive sign"); 5376 int Imm = (int)(ThisVal << LocalShiftSize); 5377 if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) || 5378 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) { 5379 if (HasWinCFI) 5380 *HasWinCFI = true; 5381 if (Imm == 0) 5382 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag); 5383 else 5384 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)) 5385 .addImm(Imm) 5386 .setMIFlag(Flag); 5387 assert(Offset == 0 && "Expected remaining offset to be zero to " 5388 "emit a single SEH directive"); 5389 } else if (DestReg == AArch64::SP) { 5390 if (HasWinCFI) 5391 *HasWinCFI = true; 5392 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc"); 5393 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) 5394 .addImm(Imm) 5395 .setMIFlag(Flag); 5396 } 5397 } 5398 5399 SrcReg = TmpReg; 5400 } while (Offset); 5401 } 5402 5403 void llvm::emitFrameOffset(MachineBasicBlock &MBB, 5404 MachineBasicBlock::iterator MBBI, const DebugLoc &DL, 5405 unsigned DestReg, unsigned SrcReg, 5406 StackOffset Offset, const TargetInstrInfo *TII, 5407 MachineInstr::MIFlag Flag, bool SetNZCV, 5408 bool NeedsWinCFI, bool *HasWinCFI, 5409 bool EmitCFAOffset, StackOffset CFAOffset, 5410 unsigned FrameReg) { 5411 // If a function is marked as arm_locally_streaming, then the runtime value of 5412 // vscale in the prologue/epilogue is different the runtime value of vscale 5413 // in the function's body. To avoid having to consider multiple vscales, 5414 // we can use `addsvl` to allocate any scalable stack-slots, which under 5415 // most circumstances will be only locals, not callee-save slots. 5416 const Function &F = MBB.getParent()->getFunction(); 5417 bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body"); 5418 5419 int64_t Bytes, NumPredicateVectors, NumDataVectors; 5420 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( 5421 Offset, Bytes, NumPredicateVectors, NumDataVectors); 5422 5423 // First emit non-scalable frame offsets, or a simple 'mov'. 5424 if (Bytes || (!Offset && SrcReg != DestReg)) { 5425 assert((DestReg != AArch64::SP || Bytes % 8 == 0) && 5426 "SP increment/decrement not 8-byte aligned"); 5427 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri; 5428 if (Bytes < 0) { 5429 Bytes = -Bytes; 5430 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri; 5431 } 5432 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag, 5433 NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset, 5434 FrameReg); 5435 CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri) 5436 ? 
StackOffset::getFixed(-Bytes) 5437 : StackOffset::getFixed(Bytes); 5438 SrcReg = DestReg; 5439 FrameReg = DestReg; 5440 } 5441 5442 assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) && 5443 "SetNZCV not supported with SVE vectors"); 5444 assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) && 5445 "WinCFI not supported with SVE vectors"); 5446 5447 if (NumDataVectors) { 5448 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors, 5449 UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI, 5450 TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset, 5451 CFAOffset, FrameReg); 5452 CFAOffset += StackOffset::getScalable(-NumDataVectors * 16); 5453 SrcReg = DestReg; 5454 } 5455 5456 if (NumPredicateVectors) { 5457 assert(DestReg != AArch64::SP && "Unaligned access to SP"); 5458 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors, 5459 UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI, 5460 TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset, 5461 CFAOffset, FrameReg); 5462 } 5463 } 5464 5465 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( 5466 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, 5467 MachineBasicBlock::iterator InsertPt, int FrameIndex, 5468 LiveIntervals *LIS, VirtRegMap *VRM) const { 5469 // This is a bit of a hack. Consider this instruction: 5470 // 5471 // %0 = COPY %sp; GPR64all:%0 5472 // 5473 // We explicitly chose GPR64all for the virtual register so such a copy might 5474 // be eliminated by RegisterCoalescer. However, that may not be possible, and 5475 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all 5476 // register class, TargetInstrInfo::foldMemoryOperand() is going to try. 5477 // 5478 // To prevent that, we are going to constrain the %0 register class here. 5479 if (MI.isFullCopy()) { 5480 Register DstReg = MI.getOperand(0).getReg(); 5481 Register SrcReg = MI.getOperand(1).getReg(); 5482 if (SrcReg == AArch64::SP && DstReg.isVirtual()) { 5483 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass); 5484 return nullptr; 5485 } 5486 if (DstReg == AArch64::SP && SrcReg.isVirtual()) { 5487 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 5488 return nullptr; 5489 } 5490 // Nothing can folded with copy from/to NZCV. 5491 if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV) 5492 return nullptr; 5493 } 5494 5495 // Handle the case where a copy is being spilled or filled but the source 5496 // and destination register class don't match. For example: 5497 // 5498 // %0 = COPY %xzr; GPR64common:%0 5499 // 5500 // In this case we can still safely fold away the COPY and generate the 5501 // following spill code: 5502 // 5503 // STRXui %xzr, %stack.0 5504 // 5505 // This also eliminates spilled cross register class COPYs (e.g. between x and 5506 // d regs) of the same size. For example: 5507 // 5508 // %0 = COPY %1; GPR64:%0, FPR64:%1 5509 // 5510 // will be filled as 5511 // 5512 // LDRDui %0, fi<#0> 5513 // 5514 // instead of 5515 // 5516 // LDRXui %Temp, fi<#0> 5517 // %0 = FMOV %Temp 5518 // 5519 if (MI.isCopy() && Ops.size() == 1 && 5520 // Make sure we're only folding the explicit COPY defs/uses. 
5521 (Ops[0] == 0 || Ops[0] == 1)) { 5522 bool IsSpill = Ops[0] == 0; 5523 bool IsFill = !IsSpill; 5524 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); 5525 const MachineRegisterInfo &MRI = MF.getRegInfo(); 5526 MachineBasicBlock &MBB = *MI.getParent(); 5527 const MachineOperand &DstMO = MI.getOperand(0); 5528 const MachineOperand &SrcMO = MI.getOperand(1); 5529 Register DstReg = DstMO.getReg(); 5530 Register SrcReg = SrcMO.getReg(); 5531 // This is slightly expensive to compute for physical regs since 5532 // getMinimalPhysRegClass is slow. 5533 auto getRegClass = [&](unsigned Reg) { 5534 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg) 5535 : TRI.getMinimalPhysRegClass(Reg); 5536 }; 5537 5538 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { 5539 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) == 5540 TRI.getRegSizeInBits(*getRegClass(SrcReg)) && 5541 "Mismatched register size in non subreg COPY"); 5542 if (IsSpill) 5543 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, 5544 getRegClass(SrcReg), &TRI, Register()); 5545 else 5546 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, 5547 getRegClass(DstReg), &TRI, Register()); 5548 return &*--InsertPt; 5549 } 5550 5551 // Handle cases like spilling def of: 5552 // 5553 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0 5554 // 5555 // where the physical register source can be widened and stored to the full 5556 // virtual reg destination stack slot, in this case producing: 5557 // 5558 // STRXui %xzr, %stack.0 5559 // 5560 if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR && 5561 TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) { 5562 assert(SrcMO.getSubReg() == 0 && 5563 "Unexpected subreg on physical register"); 5564 storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(), 5565 FrameIndex, &AArch64::GPR64RegClass, &TRI, 5566 Register()); 5567 return &*--InsertPt; 5568 } 5569 5570 // Handle cases like filling use of: 5571 // 5572 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1 5573 // 5574 // where we can load the full virtual reg source stack slot, into the subreg 5575 // destination, in this case producing: 5576 // 5577 // LDRWui %0:sub_32<def,read-undef>, %stack.0 5578 // 5579 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) { 5580 const TargetRegisterClass *FillRC; 5581 switch (DstMO.getSubReg()) { 5582 default: 5583 FillRC = nullptr; 5584 break; 5585 case AArch64::sub_32: 5586 FillRC = &AArch64::GPR32RegClass; 5587 break; 5588 case AArch64::ssub: 5589 FillRC = &AArch64::FPR32RegClass; 5590 break; 5591 case AArch64::dsub: 5592 FillRC = &AArch64::FPR64RegClass; 5593 break; 5594 } 5595 5596 if (FillRC) { 5597 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) == 5598 TRI.getRegSizeInBits(*FillRC) && 5599 "Mismatched regclass size on folded subreg COPY"); 5600 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI, 5601 Register()); 5602 MachineInstr &LoadMI = *--InsertPt; 5603 MachineOperand &LoadDst = LoadMI.getOperand(0); 5604 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load"); 5605 LoadDst.setSubReg(DstMO.getSubReg()); 5606 LoadDst.setIsUndef(); 5607 return &LoadMI; 5608 } 5609 } 5610 } 5611 5612 // Cannot fold. 5613 return nullptr; 5614 } 5615 5616 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, 5617 StackOffset &SOffset, 5618 bool *OutUseUnscaledOp, 5619 unsigned *OutUnscaledOp, 5620 int64_t *EmittableOffset) { 5621 // Set output values in case of early exit. 
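// Note that the result is a bitmask: AArch64FrameOffsetCanUpdate means the
// instruction can be rewritten at all, and AArch64FrameOffsetIsLegal is
// additionally set when the residual offset folds away completely (see the
// return at the end of this function).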
5622 if (EmittableOffset) 5623 *EmittableOffset = 0; 5624 if (OutUseUnscaledOp) 5625 *OutUseUnscaledOp = false; 5626 if (OutUnscaledOp) 5627 *OutUnscaledOp = 0; 5628 5629 // Exit early for structured vector spills/fills as they can't take an 5630 // immediate offset. 5631 switch (MI.getOpcode()) { 5632 default: 5633 break; 5634 case AArch64::LD1Rv1d: 5635 case AArch64::LD1Rv2s: 5636 case AArch64::LD1Rv2d: 5637 case AArch64::LD1Rv4h: 5638 case AArch64::LD1Rv4s: 5639 case AArch64::LD1Rv8b: 5640 case AArch64::LD1Rv8h: 5641 case AArch64::LD1Rv16b: 5642 case AArch64::LD1Twov2d: 5643 case AArch64::LD1Threev2d: 5644 case AArch64::LD1Fourv2d: 5645 case AArch64::LD1Twov1d: 5646 case AArch64::LD1Threev1d: 5647 case AArch64::LD1Fourv1d: 5648 case AArch64::ST1Twov2d: 5649 case AArch64::ST1Threev2d: 5650 case AArch64::ST1Fourv2d: 5651 case AArch64::ST1Twov1d: 5652 case AArch64::ST1Threev1d: 5653 case AArch64::ST1Fourv1d: 5654 case AArch64::ST1i8: 5655 case AArch64::ST1i16: 5656 case AArch64::ST1i32: 5657 case AArch64::ST1i64: 5658 case AArch64::IRG: 5659 case AArch64::IRGstack: 5660 case AArch64::STGloop: 5661 case AArch64::STZGloop: 5662 return AArch64FrameOffsetCannotUpdate; 5663 } 5664 5665 // Get the min/max offset and the scale. 5666 TypeSize ScaleValue(0U, false), Width(0U, false); 5667 int64_t MinOff, MaxOff; 5668 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff, 5669 MaxOff)) 5670 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 5671 5672 // Construct the complete offset. 5673 bool IsMulVL = ScaleValue.isScalable(); 5674 unsigned Scale = ScaleValue.getKnownMinValue(); 5675 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed(); 5676 5677 const MachineOperand &ImmOpnd = 5678 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode())); 5679 Offset += ImmOpnd.getImm() * Scale; 5680 5681 // If the offset doesn't match the scale, we rewrite the instruction to 5682 // use the unscaled instruction instead. Likewise, if we have a negative 5683 // offset and there is an unscaled op to use. 5684 std::optional<unsigned> UnscaledOp = 5685 AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode()); 5686 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0); 5687 if (useUnscaledOp && 5688 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff, 5689 MaxOff)) 5690 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 5691 5692 Scale = ScaleValue.getKnownMinValue(); 5693 assert(IsMulVL == ScaleValue.isScalable() && 5694 "Unscaled opcode has different value for scalable"); 5695 5696 int64_t Remainder = Offset % Scale; 5697 assert(!(Remainder && useUnscaledOp) && 5698 "Cannot have remainder when using unscaled op"); 5699 5700 assert(MinOff < MaxOff && "Unexpected Min/Max offsets"); 5701 int64_t NewOffset = Offset / Scale; 5702 if (MinOff <= NewOffset && NewOffset <= MaxOff) 5703 Offset = Remainder; 5704 else { 5705 NewOffset = NewOffset < 0 ? MinOff : MaxOff; 5706 Offset = Offset - NewOffset * Scale; 5707 } 5708 5709 if (EmittableOffset) 5710 *EmittableOffset = NewOffset; 5711 if (OutUseUnscaledOp) 5712 *OutUseUnscaledOp = useUnscaledOp; 5713 if (OutUnscaledOp && UnscaledOp) 5714 *OutUnscaledOp = *UnscaledOp; 5715 5716 if (IsMulVL) 5717 SOffset = StackOffset::get(SOffset.getFixed(), Offset); 5718 else 5719 SOffset = StackOffset::get(Offset, SOffset.getScalable()); 5720 return AArch64FrameOffsetCanUpdate | 5721 (SOffset ? 
0 : AArch64FrameOffsetIsLegal); 5722 } 5723 5724 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, 5725 unsigned FrameReg, StackOffset &Offset, 5726 const AArch64InstrInfo *TII) { 5727 unsigned Opcode = MI.getOpcode(); 5728 unsigned ImmIdx = FrameRegIdx + 1; 5729 5730 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) { 5731 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm()); 5732 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), 5733 MI.getOperand(0).getReg(), FrameReg, Offset, TII, 5734 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri)); 5735 MI.eraseFromParent(); 5736 Offset = StackOffset(); 5737 return true; 5738 } 5739 5740 int64_t NewOffset; 5741 unsigned UnscaledOp; 5742 bool UseUnscaledOp; 5743 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, 5744 &UnscaledOp, &NewOffset); 5745 if (Status & AArch64FrameOffsetCanUpdate) { 5746 if (Status & AArch64FrameOffsetIsLegal) 5747 // Replace the FrameIndex with FrameReg. 5748 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); 5749 if (UseUnscaledOp) 5750 MI.setDesc(TII->get(UnscaledOp)); 5751 5752 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); 5753 return !Offset; 5754 } 5755 5756 return false; 5757 } 5758 5759 void AArch64InstrInfo::insertNoop(MachineBasicBlock &MBB, 5760 MachineBasicBlock::iterator MI) const { 5761 DebugLoc DL; 5762 BuildMI(MBB, MI, DL, get(AArch64::HINT)).addImm(0); 5763 } 5764 5765 MCInst AArch64InstrInfo::getNop() const { 5766 return MCInstBuilder(AArch64::HINT).addImm(0); 5767 } 5768 5769 // AArch64 supports MachineCombiner. 5770 bool AArch64InstrInfo::useMachineCombiner() const { return true; } 5771 5772 // True when Opc sets flag 5773 static bool isCombineInstrSettingFlag(unsigned Opc) { 5774 switch (Opc) { 5775 case AArch64::ADDSWrr: 5776 case AArch64::ADDSWri: 5777 case AArch64::ADDSXrr: 5778 case AArch64::ADDSXri: 5779 case AArch64::SUBSWrr: 5780 case AArch64::SUBSXrr: 5781 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 5782 case AArch64::SUBSWri: 5783 case AArch64::SUBSXri: 5784 return true; 5785 default: 5786 break; 5787 } 5788 return false; 5789 } 5790 5791 // 32b Opcodes that can be combined with a MUL 5792 static bool isCombineInstrCandidate32(unsigned Opc) { 5793 switch (Opc) { 5794 case AArch64::ADDWrr: 5795 case AArch64::ADDWri: 5796 case AArch64::SUBWrr: 5797 case AArch64::ADDSWrr: 5798 case AArch64::ADDSWri: 5799 case AArch64::SUBSWrr: 5800 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 5801 case AArch64::SUBWri: 5802 case AArch64::SUBSWri: 5803 return true; 5804 default: 5805 break; 5806 } 5807 return false; 5808 } 5809 5810 // 64b Opcodes that can be combined with a MUL 5811 static bool isCombineInstrCandidate64(unsigned Opc) { 5812 switch (Opc) { 5813 case AArch64::ADDXrr: 5814 case AArch64::ADDXri: 5815 case AArch64::SUBXrr: 5816 case AArch64::ADDSXrr: 5817 case AArch64::ADDSXri: 5818 case AArch64::SUBSXrr: 5819 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 
5820 case AArch64::SUBXri: 5821 case AArch64::SUBSXri: 5822 case AArch64::ADDv8i8: 5823 case AArch64::ADDv16i8: 5824 case AArch64::ADDv4i16: 5825 case AArch64::ADDv8i16: 5826 case AArch64::ADDv2i32: 5827 case AArch64::ADDv4i32: 5828 case AArch64::SUBv8i8: 5829 case AArch64::SUBv16i8: 5830 case AArch64::SUBv4i16: 5831 case AArch64::SUBv8i16: 5832 case AArch64::SUBv2i32: 5833 case AArch64::SUBv4i32: 5834 return true; 5835 default: 5836 break; 5837 } 5838 return false; 5839 } 5840 5841 // FP Opcodes that can be combined with a FMUL. 5842 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { 5843 switch (Inst.getOpcode()) { 5844 default: 5845 break; 5846 case AArch64::FADDHrr: 5847 case AArch64::FADDSrr: 5848 case AArch64::FADDDrr: 5849 case AArch64::FADDv4f16: 5850 case AArch64::FADDv8f16: 5851 case AArch64::FADDv2f32: 5852 case AArch64::FADDv2f64: 5853 case AArch64::FADDv4f32: 5854 case AArch64::FSUBHrr: 5855 case AArch64::FSUBSrr: 5856 case AArch64::FSUBDrr: 5857 case AArch64::FSUBv4f16: 5858 case AArch64::FSUBv8f16: 5859 case AArch64::FSUBv2f32: 5860 case AArch64::FSUBv2f64: 5861 case AArch64::FSUBv4f32: 5862 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options; 5863 // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by 5864 // the target options or if FADD/FSUB has the contract fast-math flag. 5865 return Options.UnsafeFPMath || 5866 Options.AllowFPOpFusion == FPOpFusion::Fast || 5867 Inst.getFlag(MachineInstr::FmContract); 5868 return true; 5869 } 5870 return false; 5871 } 5872 5873 // Opcodes that can be combined with a MUL 5874 static bool isCombineInstrCandidate(unsigned Opc) { 5875 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc)); 5876 } 5877 5878 // 5879 // Utility routine that checks if \param MO is defined by an 5880 // \param CombineOpc instruction in the basic block \param MBB 5881 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, 5882 unsigned CombineOpc, unsigned ZeroReg = 0, 5883 bool CheckZeroReg = false) { 5884 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5885 MachineInstr *MI = nullptr; 5886 5887 if (MO.isReg() && MO.getReg().isVirtual()) 5888 MI = MRI.getUniqueVRegDef(MO.getReg()); 5889 // And it needs to be in the trace (otherwise, it won't have a depth). 5890 if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc) 5891 return false; 5892 // Must only used by the user we combine with. 5893 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) 5894 return false; 5895 5896 if (CheckZeroReg) { 5897 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && 5898 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && 5899 MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs"); 5900 // The third input reg must be zero. 5901 if (MI->getOperand(3).getReg() != ZeroReg) 5902 return false; 5903 } 5904 5905 if (isCombineInstrSettingFlag(CombineOpc) && 5906 MI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 5907 return false; 5908 5909 return true; 5910 } 5911 5912 // 5913 // Is \param MO defined by an integer multiply and can be combined? 5914 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, 5915 unsigned MulOpc, unsigned ZeroReg) { 5916 return canCombine(MBB, MO, MulOpc, ZeroReg, true); 5917 } 5918 5919 // 5920 // Is \param MO defined by a floating-point multiply and can be combined? 
5921 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, 5922 unsigned MulOpc) { 5923 return canCombine(MBB, MO, MulOpc); 5924 } 5925 5926 // TODO: There are many more machine instruction opcodes to match: 5927 // 1. Other data types (integer, vectors) 5928 // 2. Other math / logic operations (xor, or) 5929 // 3. Other forms of the same operation (intrinsics and other variants) 5930 bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst, 5931 bool Invert) const { 5932 if (Invert) 5933 return false; 5934 switch (Inst.getOpcode()) { 5935 // == Floating-point types == 5936 // -- Floating-point instructions -- 5937 case AArch64::FADDHrr: 5938 case AArch64::FADDSrr: 5939 case AArch64::FADDDrr: 5940 case AArch64::FMULHrr: 5941 case AArch64::FMULSrr: 5942 case AArch64::FMULDrr: 5943 case AArch64::FMULX16: 5944 case AArch64::FMULX32: 5945 case AArch64::FMULX64: 5946 // -- Advanced SIMD instructions -- 5947 case AArch64::FADDv4f16: 5948 case AArch64::FADDv8f16: 5949 case AArch64::FADDv2f32: 5950 case AArch64::FADDv4f32: 5951 case AArch64::FADDv2f64: 5952 case AArch64::FMULv4f16: 5953 case AArch64::FMULv8f16: 5954 case AArch64::FMULv2f32: 5955 case AArch64::FMULv4f32: 5956 case AArch64::FMULv2f64: 5957 case AArch64::FMULXv4f16: 5958 case AArch64::FMULXv8f16: 5959 case AArch64::FMULXv2f32: 5960 case AArch64::FMULXv4f32: 5961 case AArch64::FMULXv2f64: 5962 // -- SVE instructions -- 5963 // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX 5964 // in the SVE instruction set (though there are predicated ones). 5965 case AArch64::FADD_ZZZ_H: 5966 case AArch64::FADD_ZZZ_S: 5967 case AArch64::FADD_ZZZ_D: 5968 case AArch64::FMUL_ZZZ_H: 5969 case AArch64::FMUL_ZZZ_S: 5970 case AArch64::FMUL_ZZZ_D: 5971 return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath || 5972 (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && 5973 Inst.getFlag(MachineInstr::MIFlag::FmNsz)); 5974 5975 // == Integer types == 5976 // -- Base instructions -- 5977 // Opcodes MULWrr and MULXrr don't exist because 5978 // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of 5979 // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively. 5980 // The machine-combiner does not support three-source-operands machine 5981 // instruction. So we cannot reassociate MULs. 5982 case AArch64::ADDWrr: 5983 case AArch64::ADDXrr: 5984 case AArch64::ANDWrr: 5985 case AArch64::ANDXrr: 5986 case AArch64::ORRWrr: 5987 case AArch64::ORRXrr: 5988 case AArch64::EORWrr: 5989 case AArch64::EORXrr: 5990 case AArch64::EONWrr: 5991 case AArch64::EONXrr: 5992 // -- Advanced SIMD instructions -- 5993 // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL 5994 // in the Advanced SIMD instruction set. 
5995 case AArch64::ADDv8i8: 5996 case AArch64::ADDv16i8: 5997 case AArch64::ADDv4i16: 5998 case AArch64::ADDv8i16: 5999 case AArch64::ADDv2i32: 6000 case AArch64::ADDv4i32: 6001 case AArch64::ADDv1i64: 6002 case AArch64::ADDv2i64: 6003 case AArch64::MULv8i8: 6004 case AArch64::MULv16i8: 6005 case AArch64::MULv4i16: 6006 case AArch64::MULv8i16: 6007 case AArch64::MULv2i32: 6008 case AArch64::MULv4i32: 6009 case AArch64::ANDv8i8: 6010 case AArch64::ANDv16i8: 6011 case AArch64::ORRv8i8: 6012 case AArch64::ORRv16i8: 6013 case AArch64::EORv8i8: 6014 case AArch64::EORv16i8: 6015 // -- SVE instructions -- 6016 case AArch64::ADD_ZZZ_B: 6017 case AArch64::ADD_ZZZ_H: 6018 case AArch64::ADD_ZZZ_S: 6019 case AArch64::ADD_ZZZ_D: 6020 case AArch64::MUL_ZZZ_B: 6021 case AArch64::MUL_ZZZ_H: 6022 case AArch64::MUL_ZZZ_S: 6023 case AArch64::MUL_ZZZ_D: 6024 case AArch64::AND_ZZZ: 6025 case AArch64::ORR_ZZZ: 6026 case AArch64::EOR_ZZZ: 6027 return true; 6028 6029 default: 6030 return false; 6031 } 6032 } 6033 6034 /// Find instructions that can be turned into madd. 6035 static bool getMaddPatterns(MachineInstr &Root, 6036 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 6037 unsigned Opc = Root.getOpcode(); 6038 MachineBasicBlock &MBB = *Root.getParent(); 6039 bool Found = false; 6040 6041 if (!isCombineInstrCandidate(Opc)) 6042 return false; 6043 if (isCombineInstrSettingFlag(Opc)) { 6044 int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true); 6045 // When NZCV is live bail out. 6046 if (Cmp_NZCV == -1) 6047 return false; 6048 unsigned NewOpc = convertToNonFlagSettingOpc(Root); 6049 // When opcode can't change bail out. 6050 // CHECKME: do we miss any cases for opcode conversion? 6051 if (NewOpc == Opc) 6052 return false; 6053 Opc = NewOpc; 6054 } 6055 6056 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg, 6057 MachineCombinerPattern Pattern) { 6058 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) { 6059 Patterns.push_back(Pattern); 6060 Found = true; 6061 } 6062 }; 6063 6064 auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) { 6065 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) { 6066 Patterns.push_back(Pattern); 6067 Found = true; 6068 } 6069 }; 6070 6071 typedef MachineCombinerPattern MCP; 6072 6073 switch (Opc) { 6074 default: 6075 break; 6076 case AArch64::ADDWrr: 6077 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 6078 "ADDWrr does not have register operands"); 6079 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1); 6080 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2); 6081 break; 6082 case AArch64::ADDXrr: 6083 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1); 6084 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2); 6085 break; 6086 case AArch64::SUBWrr: 6087 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1); 6088 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2); 6089 break; 6090 case AArch64::SUBXrr: 6091 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1); 6092 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2); 6093 break; 6094 case AArch64::ADDWri: 6095 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1); 6096 break; 6097 case AArch64::ADDXri: 6098 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1); 6099 break; 6100 case AArch64::SUBWri: 6101 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1); 6102 break; 6103 case AArch64::SUBXri: 6104 
setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1); 6105 break; 6106 case AArch64::ADDv8i8: 6107 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1); 6108 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2); 6109 break; 6110 case AArch64::ADDv16i8: 6111 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1); 6112 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2); 6113 break; 6114 case AArch64::ADDv4i16: 6115 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1); 6116 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2); 6117 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1); 6118 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2); 6119 break; 6120 case AArch64::ADDv8i16: 6121 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1); 6122 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2); 6123 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1); 6124 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2); 6125 break; 6126 case AArch64::ADDv2i32: 6127 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1); 6128 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2); 6129 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1); 6130 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2); 6131 break; 6132 case AArch64::ADDv4i32: 6133 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1); 6134 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2); 6135 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1); 6136 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2); 6137 break; 6138 case AArch64::SUBv8i8: 6139 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1); 6140 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2); 6141 break; 6142 case AArch64::SUBv16i8: 6143 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1); 6144 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2); 6145 break; 6146 case AArch64::SUBv4i16: 6147 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1); 6148 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2); 6149 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1); 6150 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2); 6151 break; 6152 case AArch64::SUBv8i16: 6153 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1); 6154 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2); 6155 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1); 6156 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2); 6157 break; 6158 case AArch64::SUBv2i32: 6159 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1); 6160 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2); 6161 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1); 6162 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2); 6163 break; 6164 case AArch64::SUBv4i32: 6165 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1); 6166 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2); 6167 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1); 6168 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2); 6169 break; 6170 } 6171 return Found; 6172 } 6173 /// Floating-Point Support 6174 6175 /// Find instructions that can be turned into madd. 
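/// For the floating-point case a typical example is:
///   FMUL I=A,B
///   FADD R,I,C
///   ==> FMADD R,A,B,C
/// Candidates are gated by isCombineInstrCandidateFP above, i.e. the combine
/// is only attempted when contraction is permitted.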
6176 static bool getFMAPatterns(MachineInstr &Root, 6177 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 6178 6179 if (!isCombineInstrCandidateFP(Root)) 6180 return false; 6181 6182 MachineBasicBlock &MBB = *Root.getParent(); 6183 bool Found = false; 6184 6185 auto Match = [&](int Opcode, int Operand, 6186 MachineCombinerPattern Pattern) -> bool { 6187 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) { 6188 Patterns.push_back(Pattern); 6189 return true; 6190 } 6191 return false; 6192 }; 6193 6194 typedef MachineCombinerPattern MCP; 6195 6196 switch (Root.getOpcode()) { 6197 default: 6198 assert(false && "Unsupported FP instruction in combiner\n"); 6199 break; 6200 case AArch64::FADDHrr: 6201 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 6202 "FADDHrr does not have register operands"); 6203 6204 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1); 6205 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2); 6206 break; 6207 case AArch64::FADDSrr: 6208 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 6209 "FADDSrr does not have register operands"); 6210 6211 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) || 6212 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1); 6213 6214 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) || 6215 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2); 6216 break; 6217 case AArch64::FADDDrr: 6218 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) || 6219 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1); 6220 6221 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) || 6222 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2); 6223 break; 6224 case AArch64::FADDv4f16: 6225 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) || 6226 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1); 6227 6228 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) || 6229 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2); 6230 break; 6231 case AArch64::FADDv8f16: 6232 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) || 6233 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1); 6234 6235 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) || 6236 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2); 6237 break; 6238 case AArch64::FADDv2f32: 6239 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) || 6240 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1); 6241 6242 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) || 6243 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2); 6244 break; 6245 case AArch64::FADDv2f64: 6246 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) || 6247 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1); 6248 6249 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) || 6250 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2); 6251 break; 6252 case AArch64::FADDv4f32: 6253 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) || 6254 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1); 6255 6256 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) || 6257 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2); 6258 break; 6259 case AArch64::FSUBHrr: 6260 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1); 6261 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2); 6262 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1); 6263 break; 
6264 case AArch64::FSUBSrr: 6265 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1); 6266 6267 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) || 6268 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2); 6269 6270 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1); 6271 break; 6272 case AArch64::FSUBDrr: 6273 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1); 6274 6275 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) || 6276 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2); 6277 6278 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1); 6279 break; 6280 case AArch64::FSUBv4f16: 6281 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) || 6282 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2); 6283 6284 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) || 6285 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1); 6286 break; 6287 case AArch64::FSUBv8f16: 6288 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) || 6289 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2); 6290 6291 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) || 6292 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1); 6293 break; 6294 case AArch64::FSUBv2f32: 6295 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) || 6296 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2); 6297 6298 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) || 6299 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1); 6300 break; 6301 case AArch64::FSUBv2f64: 6302 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) || 6303 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2); 6304 6305 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) || 6306 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1); 6307 break; 6308 case AArch64::FSUBv4f32: 6309 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) || 6310 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2); 6311 6312 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) || 6313 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1); 6314 break; 6315 } 6316 return Found; 6317 } 6318 6319 static bool getFMULPatterns(MachineInstr &Root, 6320 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 6321 MachineBasicBlock &MBB = *Root.getParent(); 6322 bool Found = false; 6323 6324 auto Match = [&](unsigned Opcode, int Operand, 6325 MachineCombinerPattern Pattern) -> bool { 6326 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6327 MachineOperand &MO = Root.getOperand(Operand); 6328 MachineInstr *MI = nullptr; 6329 if (MO.isReg() && MO.getReg().isVirtual()) 6330 MI = MRI.getUniqueVRegDef(MO.getReg()); 6331 // Ignore No-op COPYs in FMUL(COPY(DUP(..))) 6332 if (MI && MI->getOpcode() == TargetOpcode::COPY && 6333 MI->getOperand(1).getReg().isVirtual()) 6334 MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg()); 6335 if (MI && MI->getOpcode() == Opcode) { 6336 Patterns.push_back(Pattern); 6337 return true; 6338 } 6339 return false; 6340 }; 6341 6342 typedef MachineCombinerPattern MCP; 6343 6344 switch (Root.getOpcode()) { 6345 default: 6346 return false; 6347 case AArch64::FMULv2f32: 6348 Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1); 6349 Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2); 6350 break; 6351 case AArch64::FMULv2f64: 6352 Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1); 6353 Found |= 
Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2); 6354 break; 6355 case AArch64::FMULv4f16: 6356 Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1); 6357 Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2); 6358 break; 6359 case AArch64::FMULv4f32: 6360 Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1); 6361 Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2); 6362 break; 6363 case AArch64::FMULv8f16: 6364 Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1); 6365 Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2); 6366 break; 6367 } 6368 6369 return Found; 6370 } 6371 6372 static bool getFNEGPatterns(MachineInstr &Root, 6373 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 6374 unsigned Opc = Root.getOpcode(); 6375 MachineBasicBlock &MBB = *Root.getParent(); 6376 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6377 6378 auto Match = [&](unsigned Opcode, MachineCombinerPattern Pattern) -> bool { 6379 MachineOperand &MO = Root.getOperand(1); 6380 MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg()); 6381 if (MI != nullptr && (MI->getOpcode() == Opcode) && 6382 MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) && 6383 Root.getFlag(MachineInstr::MIFlag::FmContract) && 6384 Root.getFlag(MachineInstr::MIFlag::FmNsz) && 6385 MI->getFlag(MachineInstr::MIFlag::FmContract) && 6386 MI->getFlag(MachineInstr::MIFlag::FmNsz)) { 6387 Patterns.push_back(Pattern); 6388 return true; 6389 } 6390 return false; 6391 }; 6392 6393 switch (Opc) { 6394 default: 6395 break; 6396 case AArch64::FNEGDr: 6397 return Match(AArch64::FMADDDrrr, MachineCombinerPattern::FNMADD); 6398 case AArch64::FNEGSr: 6399 return Match(AArch64::FMADDSrrr, MachineCombinerPattern::FNMADD); 6400 } 6401 6402 return false; 6403 } 6404 6405 /// Return true when a code sequence can improve throughput. It 6406 /// should be called only for instructions in loops. 
6407 /// \param Pattern - combiner pattern 6408 bool AArch64InstrInfo::isThroughputPattern( 6409 MachineCombinerPattern Pattern) const { 6410 switch (Pattern) { 6411 default: 6412 break; 6413 case MachineCombinerPattern::FMULADDH_OP1: 6414 case MachineCombinerPattern::FMULADDH_OP2: 6415 case MachineCombinerPattern::FMULSUBH_OP1: 6416 case MachineCombinerPattern::FMULSUBH_OP2: 6417 case MachineCombinerPattern::FMULADDS_OP1: 6418 case MachineCombinerPattern::FMULADDS_OP2: 6419 case MachineCombinerPattern::FMULSUBS_OP1: 6420 case MachineCombinerPattern::FMULSUBS_OP2: 6421 case MachineCombinerPattern::FMULADDD_OP1: 6422 case MachineCombinerPattern::FMULADDD_OP2: 6423 case MachineCombinerPattern::FMULSUBD_OP1: 6424 case MachineCombinerPattern::FMULSUBD_OP2: 6425 case MachineCombinerPattern::FNMULSUBH_OP1: 6426 case MachineCombinerPattern::FNMULSUBS_OP1: 6427 case MachineCombinerPattern::FNMULSUBD_OP1: 6428 case MachineCombinerPattern::FMLAv4i16_indexed_OP1: 6429 case MachineCombinerPattern::FMLAv4i16_indexed_OP2: 6430 case MachineCombinerPattern::FMLAv8i16_indexed_OP1: 6431 case MachineCombinerPattern::FMLAv8i16_indexed_OP2: 6432 case MachineCombinerPattern::FMLAv1i32_indexed_OP1: 6433 case MachineCombinerPattern::FMLAv1i32_indexed_OP2: 6434 case MachineCombinerPattern::FMLAv1i64_indexed_OP1: 6435 case MachineCombinerPattern::FMLAv1i64_indexed_OP2: 6436 case MachineCombinerPattern::FMLAv4f16_OP2: 6437 case MachineCombinerPattern::FMLAv4f16_OP1: 6438 case MachineCombinerPattern::FMLAv8f16_OP1: 6439 case MachineCombinerPattern::FMLAv8f16_OP2: 6440 case MachineCombinerPattern::FMLAv2f32_OP2: 6441 case MachineCombinerPattern::FMLAv2f32_OP1: 6442 case MachineCombinerPattern::FMLAv2f64_OP1: 6443 case MachineCombinerPattern::FMLAv2f64_OP2: 6444 case MachineCombinerPattern::FMLAv2i32_indexed_OP1: 6445 case MachineCombinerPattern::FMLAv2i32_indexed_OP2: 6446 case MachineCombinerPattern::FMLAv2i64_indexed_OP1: 6447 case MachineCombinerPattern::FMLAv2i64_indexed_OP2: 6448 case MachineCombinerPattern::FMLAv4f32_OP1: 6449 case MachineCombinerPattern::FMLAv4f32_OP2: 6450 case MachineCombinerPattern::FMLAv4i32_indexed_OP1: 6451 case MachineCombinerPattern::FMLAv4i32_indexed_OP2: 6452 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: 6453 case MachineCombinerPattern::FMLSv4i16_indexed_OP2: 6454 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: 6455 case MachineCombinerPattern::FMLSv8i16_indexed_OP2: 6456 case MachineCombinerPattern::FMLSv1i32_indexed_OP2: 6457 case MachineCombinerPattern::FMLSv1i64_indexed_OP2: 6458 case MachineCombinerPattern::FMLSv2i32_indexed_OP2: 6459 case MachineCombinerPattern::FMLSv2i64_indexed_OP2: 6460 case MachineCombinerPattern::FMLSv4f16_OP1: 6461 case MachineCombinerPattern::FMLSv4f16_OP2: 6462 case MachineCombinerPattern::FMLSv8f16_OP1: 6463 case MachineCombinerPattern::FMLSv8f16_OP2: 6464 case MachineCombinerPattern::FMLSv2f32_OP2: 6465 case MachineCombinerPattern::FMLSv2f64_OP2: 6466 case MachineCombinerPattern::FMLSv4i32_indexed_OP2: 6467 case MachineCombinerPattern::FMLSv4f32_OP2: 6468 case MachineCombinerPattern::FMULv2i32_indexed_OP1: 6469 case MachineCombinerPattern::FMULv2i32_indexed_OP2: 6470 case MachineCombinerPattern::FMULv2i64_indexed_OP1: 6471 case MachineCombinerPattern::FMULv2i64_indexed_OP2: 6472 case MachineCombinerPattern::FMULv4i16_indexed_OP1: 6473 case MachineCombinerPattern::FMULv4i16_indexed_OP2: 6474 case MachineCombinerPattern::FMULv4i32_indexed_OP1: 6475 case MachineCombinerPattern::FMULv4i32_indexed_OP2: 6476 case 
MachineCombinerPattern::FMULv8i16_indexed_OP1: 6477 case MachineCombinerPattern::FMULv8i16_indexed_OP2: 6478 case MachineCombinerPattern::MULADDv8i8_OP1: 6479 case MachineCombinerPattern::MULADDv8i8_OP2: 6480 case MachineCombinerPattern::MULADDv16i8_OP1: 6481 case MachineCombinerPattern::MULADDv16i8_OP2: 6482 case MachineCombinerPattern::MULADDv4i16_OP1: 6483 case MachineCombinerPattern::MULADDv4i16_OP2: 6484 case MachineCombinerPattern::MULADDv8i16_OP1: 6485 case MachineCombinerPattern::MULADDv8i16_OP2: 6486 case MachineCombinerPattern::MULADDv2i32_OP1: 6487 case MachineCombinerPattern::MULADDv2i32_OP2: 6488 case MachineCombinerPattern::MULADDv4i32_OP1: 6489 case MachineCombinerPattern::MULADDv4i32_OP2: 6490 case MachineCombinerPattern::MULSUBv8i8_OP1: 6491 case MachineCombinerPattern::MULSUBv8i8_OP2: 6492 case MachineCombinerPattern::MULSUBv16i8_OP1: 6493 case MachineCombinerPattern::MULSUBv16i8_OP2: 6494 case MachineCombinerPattern::MULSUBv4i16_OP1: 6495 case MachineCombinerPattern::MULSUBv4i16_OP2: 6496 case MachineCombinerPattern::MULSUBv8i16_OP1: 6497 case MachineCombinerPattern::MULSUBv8i16_OP2: 6498 case MachineCombinerPattern::MULSUBv2i32_OP1: 6499 case MachineCombinerPattern::MULSUBv2i32_OP2: 6500 case MachineCombinerPattern::MULSUBv4i32_OP1: 6501 case MachineCombinerPattern::MULSUBv4i32_OP2: 6502 case MachineCombinerPattern::MULADDv4i16_indexed_OP1: 6503 case MachineCombinerPattern::MULADDv4i16_indexed_OP2: 6504 case MachineCombinerPattern::MULADDv8i16_indexed_OP1: 6505 case MachineCombinerPattern::MULADDv8i16_indexed_OP2: 6506 case MachineCombinerPattern::MULADDv2i32_indexed_OP1: 6507 case MachineCombinerPattern::MULADDv2i32_indexed_OP2: 6508 case MachineCombinerPattern::MULADDv4i32_indexed_OP1: 6509 case MachineCombinerPattern::MULADDv4i32_indexed_OP2: 6510 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 6511 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 6512 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 6513 case MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 6514 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 6515 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 6516 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 6517 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 6518 return true; 6519 } // end switch (Pattern) 6520 return false; 6521 } 6522 6523 /// Find other MI combine patterns. 6524 static bool getMiscPatterns(MachineInstr &Root, 6525 SmallVectorImpl<MachineCombinerPattern> &Patterns) 6526 { 6527 // A - (B + C) ==> (A - B) - C or (A - C) - B 6528 unsigned Opc = Root.getOpcode(); 6529 MachineBasicBlock &MBB = *Root.getParent(); 6530 6531 switch (Opc) { 6532 case AArch64::SUBWrr: 6533 case AArch64::SUBSWrr: 6534 case AArch64::SUBXrr: 6535 case AArch64::SUBSXrr: 6536 // Found candidate root. 6537 break; 6538 default: 6539 return false; 6540 } 6541 6542 if (isCombineInstrSettingFlag(Opc) && 6543 Root.findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 6544 return false; 6545 6546 if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) || 6547 canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) || 6548 canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) || 6549 canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) { 6550 Patterns.push_back(MachineCombinerPattern::SUBADD_OP1); 6551 Patterns.push_back(MachineCombinerPattern::SUBADD_OP2); 6552 return true; 6553 } 6554 6555 return false; 6556 } 6557 6558 /// Return true when there is potentially a faster code sequence for an 6559 /// instruction chain ending in \p Root. 
All potential patterns are listed in
6560 /// the \p Pattern vector. Pattern should be sorted in priority order since the
6561 /// pattern evaluator stops checking as soon as it finds a faster sequence.
6562
6563 bool AArch64InstrInfo::getMachineCombinerPatterns(
6564 MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
6565 bool DoRegPressureReduce) const {
6566 // Integer patterns
6567 if (getMaddPatterns(Root, Patterns))
6568 return true;
6569 // Floating point patterns
6570 if (getFMULPatterns(Root, Patterns))
6571 return true;
6572 if (getFMAPatterns(Root, Patterns))
6573 return true;
6574 if (getFNEGPatterns(Root, Patterns))
6575 return true;
6576
6577 // Other patterns
6578 if (getMiscPatterns(Root, Patterns))
6579 return true;
6580
6581 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
6582 DoRegPressureReduce);
6583 }
6584
6585 enum class FMAInstKind { Default, Indexed, Accumulator };
6586 /// genFusedMultiply - Generate fused multiply instructions.
6587 /// This function supports both integer and floating point instructions.
6588 /// A typical example:
6589 /// F|MUL I=A,B,0
6590 /// F|ADD R,I,C
6591 /// ==> F|MADD R,A,B,C
6592 /// \param MF Containing MachineFunction
6593 /// \param MRI Register information
6594 /// \param TII Target information
6595 /// \param Root is the F|ADD instruction
6596 /// \param [out] InsInstrs is a vector of machine instructions and will
6597 /// contain the generated madd instruction
6598 /// \param IdxMulOpd is index of operand in Root that is the result of
6599 /// the F|MUL. In the example above IdxMulOpd is 1.
6600 /// \param MaddOpc the opcode of the f|madd instruction
6601 /// \param RC Register class of operands
6602 /// \param kind The kind of fma instruction (addressing mode) to be generated
6603 /// \param ReplacedAddend is the result register from the instruction
6604 /// replacing the non-combined operand, if any.
6605 static MachineInstr *
6606 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
6607 const TargetInstrInfo *TII, MachineInstr &Root,
6608 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
6609 unsigned MaddOpc, const TargetRegisterClass *RC,
6610 FMAInstKind kind = FMAInstKind::Default,
6611 const Register *ReplacedAddend = nullptr) {
6612 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
6613
6614 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
6615 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
6616 Register ResultReg = Root.getOperand(0).getReg();
6617 Register SrcReg0 = MUL->getOperand(1).getReg();
6618 bool Src0IsKill = MUL->getOperand(1).isKill();
6619 Register SrcReg1 = MUL->getOperand(2).getReg();
6620 bool Src1IsKill = MUL->getOperand(2).isKill();
6621
6622 Register SrcReg2;
6623 bool Src2IsKill;
6624 if (ReplacedAddend) {
6625 // If we just generated a new addend, we must be its only use.
6626 SrcReg2 = *ReplacedAddend; 6627 Src2IsKill = true; 6628 } else { 6629 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg(); 6630 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill(); 6631 } 6632 6633 if (ResultReg.isVirtual()) 6634 MRI.constrainRegClass(ResultReg, RC); 6635 if (SrcReg0.isVirtual()) 6636 MRI.constrainRegClass(SrcReg0, RC); 6637 if (SrcReg1.isVirtual()) 6638 MRI.constrainRegClass(SrcReg1, RC); 6639 if (SrcReg2.isVirtual()) 6640 MRI.constrainRegClass(SrcReg2, RC); 6641 6642 MachineInstrBuilder MIB; 6643 if (kind == FMAInstKind::Default) 6644 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg) 6645 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 6646 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 6647 .addReg(SrcReg2, getKillRegState(Src2IsKill)); 6648 else if (kind == FMAInstKind::Indexed) 6649 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg) 6650 .addReg(SrcReg2, getKillRegState(Src2IsKill)) 6651 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 6652 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 6653 .addImm(MUL->getOperand(3).getImm()); 6654 else if (kind == FMAInstKind::Accumulator) 6655 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg) 6656 .addReg(SrcReg2, getKillRegState(Src2IsKill)) 6657 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 6658 .addReg(SrcReg1, getKillRegState(Src1IsKill)); 6659 else 6660 assert(false && "Invalid FMA instruction kind \n"); 6661 // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL) 6662 InsInstrs.push_back(MIB); 6663 return MUL; 6664 } 6665 6666 static MachineInstr * 6667 genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI, 6668 const TargetInstrInfo *TII, MachineInstr &Root, 6669 SmallVectorImpl<MachineInstr *> &InsInstrs) { 6670 MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); 6671 6672 unsigned Opc = 0; 6673 const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg()); 6674 if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 6675 Opc = AArch64::FNMADDSrrr; 6676 else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) 6677 Opc = AArch64::FNMADDDrrr; 6678 else 6679 return nullptr; 6680 6681 Register ResultReg = Root.getOperand(0).getReg(); 6682 Register SrcReg0 = MAD->getOperand(1).getReg(); 6683 Register SrcReg1 = MAD->getOperand(2).getReg(); 6684 Register SrcReg2 = MAD->getOperand(3).getReg(); 6685 bool Src0IsKill = MAD->getOperand(1).isKill(); 6686 bool Src1IsKill = MAD->getOperand(2).isKill(); 6687 bool Src2IsKill = MAD->getOperand(3).isKill(); 6688 if (ResultReg.isVirtual()) 6689 MRI.constrainRegClass(ResultReg, RC); 6690 if (SrcReg0.isVirtual()) 6691 MRI.constrainRegClass(SrcReg0, RC); 6692 if (SrcReg1.isVirtual()) 6693 MRI.constrainRegClass(SrcReg1, RC); 6694 if (SrcReg2.isVirtual()) 6695 MRI.constrainRegClass(SrcReg2, RC); 6696 6697 MachineInstrBuilder MIB = 6698 BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg) 6699 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 6700 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 6701 .addReg(SrcReg2, getKillRegState(Src2IsKill)); 6702 InsInstrs.push_back(MIB); 6703 6704 return MAD; 6705 } 6706 6707 /// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane) 6708 static MachineInstr * 6709 genIndexedMultiply(MachineInstr &Root, 6710 SmallVectorImpl<MachineInstr *> &InsInstrs, 6711 unsigned IdxDupOp, unsigned MulOpc, 6712 const TargetRegisterClass *RC, MachineRegisterInfo &MRI) { 6713 assert(((IdxDupOp == 1) || (IdxDupOp == 2)) && 6714 "Invalid index of FMUL operand"); 6715 6716 MachineFunction &MF = *Root.getMF(); 6717 const 
TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); 6718 6719 MachineInstr *Dup = 6720 MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg()); 6721 6722 if (Dup->getOpcode() == TargetOpcode::COPY) 6723 Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg()); 6724 6725 Register DupSrcReg = Dup->getOperand(1).getReg(); 6726 MRI.clearKillFlags(DupSrcReg); 6727 MRI.constrainRegClass(DupSrcReg, RC); 6728 6729 unsigned DupSrcLane = Dup->getOperand(2).getImm(); 6730 6731 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1; 6732 MachineOperand &MulOp = Root.getOperand(IdxMulOp); 6733 6734 Register ResultReg = Root.getOperand(0).getReg(); 6735 6736 MachineInstrBuilder MIB; 6737 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg) 6738 .add(MulOp) 6739 .addReg(DupSrcReg) 6740 .addImm(DupSrcLane); 6741 6742 InsInstrs.push_back(MIB); 6743 return &Root; 6744 } 6745 6746 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate 6747 /// instructions. 6748 /// 6749 /// \see genFusedMultiply 6750 static MachineInstr *genFusedMultiplyAcc( 6751 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 6752 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 6753 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) { 6754 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 6755 FMAInstKind::Accumulator); 6756 } 6757 6758 /// genNeg - Helper to generate an intermediate negation of the second operand 6759 /// of Root 6760 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI, 6761 const TargetInstrInfo *TII, MachineInstr &Root, 6762 SmallVectorImpl<MachineInstr *> &InsInstrs, 6763 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, 6764 unsigned MnegOpc, const TargetRegisterClass *RC) { 6765 Register NewVR = MRI.createVirtualRegister(RC); 6766 MachineInstrBuilder MIB = 6767 BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR) 6768 .add(Root.getOperand(2)); 6769 InsInstrs.push_back(MIB); 6770 6771 assert(InstrIdxForVirtReg.empty()); 6772 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6773 6774 return NewVR; 6775 } 6776 6777 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate 6778 /// instructions with an additional negation of the accumulator 6779 static MachineInstr *genFusedMultiplyAccNeg( 6780 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 6781 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 6782 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd, 6783 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) { 6784 assert(IdxMulOpd == 1); 6785 6786 Register NewVR = 6787 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC); 6788 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 6789 FMAInstKind::Accumulator, &NewVR); 6790 } 6791 6792 /// genFusedMultiplyIdx - Helper to generate fused multiply accumulate 6793 /// instructions. 
6794 ///
6795 /// \see genFusedMultiply
6796 static MachineInstr *genFusedMultiplyIdx(
6797     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6798     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6799     unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
6800   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6801                           FMAInstKind::Indexed);
6802 }
6803
6804 /// genFusedMultiplyIdxNeg - Helper to generate indexed fused multiply
6805 /// accumulate instructions with an additional negation of the accumulator
6806 static MachineInstr *genFusedMultiplyIdxNeg(
6807     MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6808     MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6809     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
6810     unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
6811   assert(IdxMulOpd == 1);
6812
6813   Register NewVR =
6814       genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
6815
6816   return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6817                           FMAInstKind::Indexed, &NewVR);
6818 }
6819
6820 /// genMaddR - Generate madd instruction and combine mul and add using
6821 /// an extra virtual register
6822 /// Example - an ADD intermediate needs to be stored in a register:
6823 ///   MUL I=A,B,0
6824 ///   ADD R,I,Imm
6825 ///   ==> ORR V, ZR, Imm
6826 ///   ==> MADD R,A,B,V
6827 /// \param MF Containing MachineFunction
6828 /// \param MRI Register information
6829 /// \param TII Target information
6830 /// \param Root is the ADD instruction
6831 /// \param [out] InsInstrs is a vector of machine instructions and will
6832 /// contain the generated madd instruction
6833 /// \param IdxMulOpd is the index of the operand in Root that is the result
6834 /// of the MUL. In the example above IdxMulOpd is 1.
6835 /// \param MaddOpc the opcode of the madd instruction
6836 /// \param VR is a virtual register that holds the value of an ADD operand
6837 /// (V in the example above).
6838 /// \param RC Register class of operands 6839 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, 6840 const TargetInstrInfo *TII, MachineInstr &Root, 6841 SmallVectorImpl<MachineInstr *> &InsInstrs, 6842 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, 6843 const TargetRegisterClass *RC) { 6844 assert(IdxMulOpd == 1 || IdxMulOpd == 2); 6845 6846 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); 6847 Register ResultReg = Root.getOperand(0).getReg(); 6848 Register SrcReg0 = MUL->getOperand(1).getReg(); 6849 bool Src0IsKill = MUL->getOperand(1).isKill(); 6850 Register SrcReg1 = MUL->getOperand(2).getReg(); 6851 bool Src1IsKill = MUL->getOperand(2).isKill(); 6852 6853 if (ResultReg.isVirtual()) 6854 MRI.constrainRegClass(ResultReg, RC); 6855 if (SrcReg0.isVirtual()) 6856 MRI.constrainRegClass(SrcReg0, RC); 6857 if (SrcReg1.isVirtual()) 6858 MRI.constrainRegClass(SrcReg1, RC); 6859 if (Register::isVirtualRegister(VR)) 6860 MRI.constrainRegClass(VR, RC); 6861 6862 MachineInstrBuilder MIB = 6863 BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg) 6864 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 6865 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 6866 .addReg(VR); 6867 // Insert the MADD 6868 InsInstrs.push_back(MIB); 6869 return MUL; 6870 } 6871 6872 /// Do the following transformation 6873 /// A - (B + C) ==> (A - B) - C 6874 /// A - (B + C) ==> (A - C) - B 6875 static void 6876 genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI, 6877 const TargetInstrInfo *TII, MachineInstr &Root, 6878 SmallVectorImpl<MachineInstr *> &InsInstrs, 6879 SmallVectorImpl<MachineInstr *> &DelInstrs, 6880 unsigned IdxOpd1, 6881 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) { 6882 assert(IdxOpd1 == 1 || IdxOpd1 == 2); 6883 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 
2 : 1; 6884 MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg()); 6885 6886 Register ResultReg = Root.getOperand(0).getReg(); 6887 Register RegA = Root.getOperand(1).getReg(); 6888 bool RegAIsKill = Root.getOperand(1).isKill(); 6889 Register RegB = AddMI->getOperand(IdxOpd1).getReg(); 6890 bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill(); 6891 Register RegC = AddMI->getOperand(IdxOtherOpd).getReg(); 6892 bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill(); 6893 Register NewVR = MRI.createVirtualRegister(MRI.getRegClass(RegA)); 6894 6895 unsigned Opcode = Root.getOpcode(); 6896 if (Opcode == AArch64::SUBSWrr) 6897 Opcode = AArch64::SUBWrr; 6898 else if (Opcode == AArch64::SUBSXrr) 6899 Opcode = AArch64::SUBXrr; 6900 else 6901 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) && 6902 "Unexpected instruction opcode."); 6903 6904 MachineInstrBuilder MIB1 = 6905 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR) 6906 .addReg(RegA, getKillRegState(RegAIsKill)) 6907 .addReg(RegB, getKillRegState(RegBIsKill)); 6908 MachineInstrBuilder MIB2 = 6909 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg) 6910 .addReg(NewVR, getKillRegState(true)) 6911 .addReg(RegC, getKillRegState(RegCIsKill)); 6912 6913 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6914 InsInstrs.push_back(MIB1); 6915 InsInstrs.push_back(MIB2); 6916 DelInstrs.push_back(AddMI); 6917 } 6918 6919 /// When getMachineCombinerPatterns() finds potential patterns, 6920 /// this function generates the instructions that could replace the 6921 /// original code sequence 6922 void AArch64InstrInfo::genAlternativeCodeSequence( 6923 MachineInstr &Root, MachineCombinerPattern Pattern, 6924 SmallVectorImpl<MachineInstr *> &InsInstrs, 6925 SmallVectorImpl<MachineInstr *> &DelInstrs, 6926 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const { 6927 MachineBasicBlock &MBB = *Root.getParent(); 6928 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6929 MachineFunction &MF = *MBB.getParent(); 6930 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); 6931 6932 MachineInstr *MUL = nullptr; 6933 const TargetRegisterClass *RC; 6934 unsigned Opc; 6935 switch (Pattern) { 6936 default: 6937 // Reassociate instructions. 
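    // For example, the generic reassociation code can rebalance a chain such
    // as ((a + b) + c) + d into (a + b) + (c + d), shortening the critical
    // path by letting the two halves execute in parallel.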
6938 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, 6939 DelInstrs, InstrIdxForVirtReg); 6940 return; 6941 case MachineCombinerPattern::SUBADD_OP1: 6942 // A - (B + C) 6943 // ==> (A - B) - C 6944 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1, 6945 InstrIdxForVirtReg); 6946 break; 6947 case MachineCombinerPattern::SUBADD_OP2: 6948 // A - (B + C) 6949 // ==> (A - C) - B 6950 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2, 6951 InstrIdxForVirtReg); 6952 break; 6953 case MachineCombinerPattern::MULADDW_OP1: 6954 case MachineCombinerPattern::MULADDX_OP1: 6955 // MUL I=A,B,0 6956 // ADD R,I,C 6957 // ==> MADD R,A,B,C 6958 // --- Create(MADD); 6959 if (Pattern == MachineCombinerPattern::MULADDW_OP1) { 6960 Opc = AArch64::MADDWrrr; 6961 RC = &AArch64::GPR32RegClass; 6962 } else { 6963 Opc = AArch64::MADDXrrr; 6964 RC = &AArch64::GPR64RegClass; 6965 } 6966 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6967 break; 6968 case MachineCombinerPattern::MULADDW_OP2: 6969 case MachineCombinerPattern::MULADDX_OP2: 6970 // MUL I=A,B,0 6971 // ADD R,C,I 6972 // ==> MADD R,A,B,C 6973 // --- Create(MADD); 6974 if (Pattern == MachineCombinerPattern::MULADDW_OP2) { 6975 Opc = AArch64::MADDWrrr; 6976 RC = &AArch64::GPR32RegClass; 6977 } else { 6978 Opc = AArch64::MADDXrrr; 6979 RC = &AArch64::GPR64RegClass; 6980 } 6981 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6982 break; 6983 case MachineCombinerPattern::MULADDWI_OP1: 6984 case MachineCombinerPattern::MULADDXI_OP1: { 6985 // MUL I=A,B,0 6986 // ADD R,I,Imm 6987 // ==> MOV V, Imm 6988 // ==> MADD R,A,B,V 6989 // --- Create(MADD); 6990 const TargetRegisterClass *OrrRC; 6991 unsigned BitSize, OrrOpc, ZeroReg; 6992 if (Pattern == MachineCombinerPattern::MULADDWI_OP1) { 6993 OrrOpc = AArch64::ORRWri; 6994 OrrRC = &AArch64::GPR32spRegClass; 6995 BitSize = 32; 6996 ZeroReg = AArch64::WZR; 6997 Opc = AArch64::MADDWrrr; 6998 RC = &AArch64::GPR32RegClass; 6999 } else { 7000 OrrOpc = AArch64::ORRXri; 7001 OrrRC = &AArch64::GPR64spRegClass; 7002 BitSize = 64; 7003 ZeroReg = AArch64::XZR; 7004 Opc = AArch64::MADDXrrr; 7005 RC = &AArch64::GPR64RegClass; 7006 } 7007 Register NewVR = MRI.createVirtualRegister(OrrRC); 7008 uint64_t Imm = Root.getOperand(2).getImm(); 7009 7010 if (Root.getOperand(3).isImm()) { 7011 unsigned Val = Root.getOperand(3).getImm(); 7012 Imm = Imm << Val; 7013 } 7014 uint64_t UImm = SignExtend64(Imm, BitSize); 7015 // The immediate can be composed via a single instruction. 7016 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 7017 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn); 7018 if (Insn.size() != 1) 7019 return; 7020 auto MovI = Insn.begin(); 7021 MachineInstrBuilder MIB1; 7022 // MOV is an alias for one of three instructions: movz, movn, and orr. 
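    // e.g. #16 is materialized with MOVZ, #-1 with MOVN, and a bitmask
    // immediate such as #0x00ff00ff00ff00ff with ORR; expandMOVImm has
    // already chosen the appropriate form above.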
7023 if (MovI->Opcode == OrrOpc) 7024 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR) 7025 .addReg(ZeroReg) 7026 .addImm(MovI->Op2); 7027 else { 7028 if (BitSize == 32) 7029 assert((MovI->Opcode == AArch64::MOVNWi || 7030 MovI->Opcode == AArch64::MOVZWi) && 7031 "Expected opcode"); 7032 else 7033 assert((MovI->Opcode == AArch64::MOVNXi || 7034 MovI->Opcode == AArch64::MOVZXi) && 7035 "Expected opcode"); 7036 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR) 7037 .addImm(MovI->Op1) 7038 .addImm(MovI->Op2); 7039 } 7040 InsInstrs.push_back(MIB1); 7041 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7042 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 7043 break; 7044 } 7045 case MachineCombinerPattern::MULSUBW_OP1: 7046 case MachineCombinerPattern::MULSUBX_OP1: { 7047 // MUL I=A,B,0 7048 // SUB R,I, C 7049 // ==> SUB V, 0, C 7050 // ==> MADD R,A,B,V // = -C + A*B 7051 // --- Create(MADD); 7052 const TargetRegisterClass *SubRC; 7053 unsigned SubOpc, ZeroReg; 7054 if (Pattern == MachineCombinerPattern::MULSUBW_OP1) { 7055 SubOpc = AArch64::SUBWrr; 7056 SubRC = &AArch64::GPR32spRegClass; 7057 ZeroReg = AArch64::WZR; 7058 Opc = AArch64::MADDWrrr; 7059 RC = &AArch64::GPR32RegClass; 7060 } else { 7061 SubOpc = AArch64::SUBXrr; 7062 SubRC = &AArch64::GPR64spRegClass; 7063 ZeroReg = AArch64::XZR; 7064 Opc = AArch64::MADDXrrr; 7065 RC = &AArch64::GPR64RegClass; 7066 } 7067 Register NewVR = MRI.createVirtualRegister(SubRC); 7068 // SUB NewVR, 0, C 7069 MachineInstrBuilder MIB1 = 7070 BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR) 7071 .addReg(ZeroReg) 7072 .add(Root.getOperand(2)); 7073 InsInstrs.push_back(MIB1); 7074 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7075 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 7076 break; 7077 } 7078 case MachineCombinerPattern::MULSUBW_OP2: 7079 case MachineCombinerPattern::MULSUBX_OP2: 7080 // MUL I=A,B,0 7081 // SUB R,C,I 7082 // ==> MSUB R,A,B,C (computes C - A*B) 7083 // --- Create(MSUB); 7084 if (Pattern == MachineCombinerPattern::MULSUBW_OP2) { 7085 Opc = AArch64::MSUBWrrr; 7086 RC = &AArch64::GPR32RegClass; 7087 } else { 7088 Opc = AArch64::MSUBXrrr; 7089 RC = &AArch64::GPR64RegClass; 7090 } 7091 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7092 break; 7093 case MachineCombinerPattern::MULSUBWI_OP1: 7094 case MachineCombinerPattern::MULSUBXI_OP1: { 7095 // MUL I=A,B,0 7096 // SUB R,I, Imm 7097 // ==> MOV V, -Imm 7098 // ==> MADD R,A,B,V // = -Imm + A*B 7099 // --- Create(MADD); 7100 const TargetRegisterClass *OrrRC; 7101 unsigned BitSize, OrrOpc, ZeroReg; 7102 if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) { 7103 OrrOpc = AArch64::ORRWri; 7104 OrrRC = &AArch64::GPR32spRegClass; 7105 BitSize = 32; 7106 ZeroReg = AArch64::WZR; 7107 Opc = AArch64::MADDWrrr; 7108 RC = &AArch64::GPR32RegClass; 7109 } else { 7110 OrrOpc = AArch64::ORRXri; 7111 OrrRC = &AArch64::GPR64spRegClass; 7112 BitSize = 64; 7113 ZeroReg = AArch64::XZR; 7114 Opc = AArch64::MADDXrrr; 7115 RC = &AArch64::GPR64RegClass; 7116 } 7117 Register NewVR = MRI.createVirtualRegister(OrrRC); 7118 uint64_t Imm = Root.getOperand(2).getImm(); 7119 if (Root.getOperand(3).isImm()) { 7120 unsigned Val = Root.getOperand(3).getImm(); 7121 Imm = Imm << Val; 7122 } 7123 uint64_t UImm = SignExtend64(-Imm, BitSize); 7124 // The immediate can be composed via a single instruction. 
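    // This is verified below: if expandMOVImm needs more than one instruction
    // to build the (negated, possibly shifted) value, the combine is abandoned.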
7125 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 7126 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn); 7127 if (Insn.size() != 1) 7128 return; 7129 auto MovI = Insn.begin(); 7130 MachineInstrBuilder MIB1; 7131 // MOV is an alias for one of three instructions: movz, movn, and orr. 7132 if (MovI->Opcode == OrrOpc) 7133 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR) 7134 .addReg(ZeroReg) 7135 .addImm(MovI->Op2); 7136 else { 7137 if (BitSize == 32) 7138 assert((MovI->Opcode == AArch64::MOVNWi || 7139 MovI->Opcode == AArch64::MOVZWi) && 7140 "Expected opcode"); 7141 else 7142 assert((MovI->Opcode == AArch64::MOVNXi || 7143 MovI->Opcode == AArch64::MOVZXi) && 7144 "Expected opcode"); 7145 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR) 7146 .addImm(MovI->Op1) 7147 .addImm(MovI->Op2); 7148 } 7149 InsInstrs.push_back(MIB1); 7150 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7151 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 7152 break; 7153 } 7154 7155 case MachineCombinerPattern::MULADDv8i8_OP1: 7156 Opc = AArch64::MLAv8i8; 7157 RC = &AArch64::FPR64RegClass; 7158 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7159 break; 7160 case MachineCombinerPattern::MULADDv8i8_OP2: 7161 Opc = AArch64::MLAv8i8; 7162 RC = &AArch64::FPR64RegClass; 7163 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7164 break; 7165 case MachineCombinerPattern::MULADDv16i8_OP1: 7166 Opc = AArch64::MLAv16i8; 7167 RC = &AArch64::FPR128RegClass; 7168 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7169 break; 7170 case MachineCombinerPattern::MULADDv16i8_OP2: 7171 Opc = AArch64::MLAv16i8; 7172 RC = &AArch64::FPR128RegClass; 7173 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7174 break; 7175 case MachineCombinerPattern::MULADDv4i16_OP1: 7176 Opc = AArch64::MLAv4i16; 7177 RC = &AArch64::FPR64RegClass; 7178 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7179 break; 7180 case MachineCombinerPattern::MULADDv4i16_OP2: 7181 Opc = AArch64::MLAv4i16; 7182 RC = &AArch64::FPR64RegClass; 7183 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7184 break; 7185 case MachineCombinerPattern::MULADDv8i16_OP1: 7186 Opc = AArch64::MLAv8i16; 7187 RC = &AArch64::FPR128RegClass; 7188 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7189 break; 7190 case MachineCombinerPattern::MULADDv8i16_OP2: 7191 Opc = AArch64::MLAv8i16; 7192 RC = &AArch64::FPR128RegClass; 7193 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7194 break; 7195 case MachineCombinerPattern::MULADDv2i32_OP1: 7196 Opc = AArch64::MLAv2i32; 7197 RC = &AArch64::FPR64RegClass; 7198 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7199 break; 7200 case MachineCombinerPattern::MULADDv2i32_OP2: 7201 Opc = AArch64::MLAv2i32; 7202 RC = &AArch64::FPR64RegClass; 7203 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7204 break; 7205 case MachineCombinerPattern::MULADDv4i32_OP1: 7206 Opc = AArch64::MLAv4i32; 7207 RC = &AArch64::FPR128RegClass; 7208 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7209 break; 7210 case MachineCombinerPattern::MULADDv4i32_OP2: 7211 Opc = AArch64::MLAv4i32; 7212 RC = &AArch64::FPR128RegClass; 7213 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7214 break; 7215 7216 case MachineCombinerPattern::MULSUBv8i8_OP1: 7217 Opc = AArch64::MLAv8i8; 7218 RC = 
&AArch64::FPR64RegClass; 7219 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 7220 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8, 7221 RC); 7222 break; 7223 case MachineCombinerPattern::MULSUBv8i8_OP2: 7224 Opc = AArch64::MLSv8i8; 7225 RC = &AArch64::FPR64RegClass; 7226 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7227 break; 7228 case MachineCombinerPattern::MULSUBv16i8_OP1: 7229 Opc = AArch64::MLAv16i8; 7230 RC = &AArch64::FPR128RegClass; 7231 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 7232 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8, 7233 RC); 7234 break; 7235 case MachineCombinerPattern::MULSUBv16i8_OP2: 7236 Opc = AArch64::MLSv16i8; 7237 RC = &AArch64::FPR128RegClass; 7238 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7239 break; 7240 case MachineCombinerPattern::MULSUBv4i16_OP1: 7241 Opc = AArch64::MLAv4i16; 7242 RC = &AArch64::FPR64RegClass; 7243 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 7244 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, 7245 RC); 7246 break; 7247 case MachineCombinerPattern::MULSUBv4i16_OP2: 7248 Opc = AArch64::MLSv4i16; 7249 RC = &AArch64::FPR64RegClass; 7250 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7251 break; 7252 case MachineCombinerPattern::MULSUBv8i16_OP1: 7253 Opc = AArch64::MLAv8i16; 7254 RC = &AArch64::FPR128RegClass; 7255 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 7256 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, 7257 RC); 7258 break; 7259 case MachineCombinerPattern::MULSUBv8i16_OP2: 7260 Opc = AArch64::MLSv8i16; 7261 RC = &AArch64::FPR128RegClass; 7262 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7263 break; 7264 case MachineCombinerPattern::MULSUBv2i32_OP1: 7265 Opc = AArch64::MLAv2i32; 7266 RC = &AArch64::FPR64RegClass; 7267 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 7268 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, 7269 RC); 7270 break; 7271 case MachineCombinerPattern::MULSUBv2i32_OP2: 7272 Opc = AArch64::MLSv2i32; 7273 RC = &AArch64::FPR64RegClass; 7274 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7275 break; 7276 case MachineCombinerPattern::MULSUBv4i32_OP1: 7277 Opc = AArch64::MLAv4i32; 7278 RC = &AArch64::FPR128RegClass; 7279 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 7280 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, 7281 RC); 7282 break; 7283 case MachineCombinerPattern::MULSUBv4i32_OP2: 7284 Opc = AArch64::MLSv4i32; 7285 RC = &AArch64::FPR128RegClass; 7286 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7287 break; 7288 7289 case MachineCombinerPattern::MULADDv4i16_indexed_OP1: 7290 Opc = AArch64::MLAv4i16_indexed; 7291 RC = &AArch64::FPR64RegClass; 7292 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7293 break; 7294 case MachineCombinerPattern::MULADDv4i16_indexed_OP2: 7295 Opc = AArch64::MLAv4i16_indexed; 7296 RC = &AArch64::FPR64RegClass; 7297 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7298 break; 7299 case MachineCombinerPattern::MULADDv8i16_indexed_OP1: 7300 Opc = AArch64::MLAv8i16_indexed; 7301 RC = &AArch64::FPR128RegClass; 7302 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7303 break; 7304 case MachineCombinerPattern::MULADDv8i16_indexed_OP2: 7305 Opc = AArch64::MLAv8i16_indexed; 7306 RC = &AArch64::FPR128RegClass; 7307 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7308 break; 7309 case 
MachineCombinerPattern::MULADDv2i32_indexed_OP1: 7310 Opc = AArch64::MLAv2i32_indexed; 7311 RC = &AArch64::FPR64RegClass; 7312 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7313 break; 7314 case MachineCombinerPattern::MULADDv2i32_indexed_OP2: 7315 Opc = AArch64::MLAv2i32_indexed; 7316 RC = &AArch64::FPR64RegClass; 7317 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7318 break; 7319 case MachineCombinerPattern::MULADDv4i32_indexed_OP1: 7320 Opc = AArch64::MLAv4i32_indexed; 7321 RC = &AArch64::FPR128RegClass; 7322 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7323 break; 7324 case MachineCombinerPattern::MULADDv4i32_indexed_OP2: 7325 Opc = AArch64::MLAv4i32_indexed; 7326 RC = &AArch64::FPR128RegClass; 7327 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7328 break; 7329 7330 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 7331 Opc = AArch64::MLAv4i16_indexed; 7332 RC = &AArch64::FPR64RegClass; 7333 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 7334 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, 7335 RC); 7336 break; 7337 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 7338 Opc = AArch64::MLSv4i16_indexed; 7339 RC = &AArch64::FPR64RegClass; 7340 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7341 break; 7342 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 7343 Opc = AArch64::MLAv8i16_indexed; 7344 RC = &AArch64::FPR128RegClass; 7345 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 7346 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, 7347 RC); 7348 break; 7349 case MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 7350 Opc = AArch64::MLSv8i16_indexed; 7351 RC = &AArch64::FPR128RegClass; 7352 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7353 break; 7354 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 7355 Opc = AArch64::MLAv2i32_indexed; 7356 RC = &AArch64::FPR64RegClass; 7357 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 7358 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, 7359 RC); 7360 break; 7361 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 7362 Opc = AArch64::MLSv2i32_indexed; 7363 RC = &AArch64::FPR64RegClass; 7364 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7365 break; 7366 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 7367 Opc = AArch64::MLAv4i32_indexed; 7368 RC = &AArch64::FPR128RegClass; 7369 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 7370 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, 7371 RC); 7372 break; 7373 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 7374 Opc = AArch64::MLSv4i32_indexed; 7375 RC = &AArch64::FPR128RegClass; 7376 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7377 break; 7378 7379 // Floating Point Support 7380 case MachineCombinerPattern::FMULADDH_OP1: 7381 Opc = AArch64::FMADDHrrr; 7382 RC = &AArch64::FPR16RegClass; 7383 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7384 break; 7385 case MachineCombinerPattern::FMULADDS_OP1: 7386 Opc = AArch64::FMADDSrrr; 7387 RC = &AArch64::FPR32RegClass; 7388 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7389 break; 7390 case MachineCombinerPattern::FMULADDD_OP1: 7391 Opc = AArch64::FMADDDrrr; 7392 RC = &AArch64::FPR64RegClass; 7393 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7394 break; 7395 7396 case MachineCombinerPattern::FMULADDH_OP2: 7397 Opc = AArch64::FMADDHrrr; 
7398 RC = &AArch64::FPR16RegClass; 7399 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7400 break; 7401 case MachineCombinerPattern::FMULADDS_OP2: 7402 Opc = AArch64::FMADDSrrr; 7403 RC = &AArch64::FPR32RegClass; 7404 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7405 break; 7406 case MachineCombinerPattern::FMULADDD_OP2: 7407 Opc = AArch64::FMADDDrrr; 7408 RC = &AArch64::FPR64RegClass; 7409 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7410 break; 7411 7412 case MachineCombinerPattern::FMLAv1i32_indexed_OP1: 7413 Opc = AArch64::FMLAv1i32_indexed; 7414 RC = &AArch64::FPR32RegClass; 7415 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7416 FMAInstKind::Indexed); 7417 break; 7418 case MachineCombinerPattern::FMLAv1i32_indexed_OP2: 7419 Opc = AArch64::FMLAv1i32_indexed; 7420 RC = &AArch64::FPR32RegClass; 7421 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7422 FMAInstKind::Indexed); 7423 break; 7424 7425 case MachineCombinerPattern::FMLAv1i64_indexed_OP1: 7426 Opc = AArch64::FMLAv1i64_indexed; 7427 RC = &AArch64::FPR64RegClass; 7428 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7429 FMAInstKind::Indexed); 7430 break; 7431 case MachineCombinerPattern::FMLAv1i64_indexed_OP2: 7432 Opc = AArch64::FMLAv1i64_indexed; 7433 RC = &AArch64::FPR64RegClass; 7434 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7435 FMAInstKind::Indexed); 7436 break; 7437 7438 case MachineCombinerPattern::FMLAv4i16_indexed_OP1: 7439 RC = &AArch64::FPR64RegClass; 7440 Opc = AArch64::FMLAv4i16_indexed; 7441 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7442 FMAInstKind::Indexed); 7443 break; 7444 case MachineCombinerPattern::FMLAv4f16_OP1: 7445 RC = &AArch64::FPR64RegClass; 7446 Opc = AArch64::FMLAv4f16; 7447 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7448 FMAInstKind::Accumulator); 7449 break; 7450 case MachineCombinerPattern::FMLAv4i16_indexed_OP2: 7451 RC = &AArch64::FPR64RegClass; 7452 Opc = AArch64::FMLAv4i16_indexed; 7453 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7454 FMAInstKind::Indexed); 7455 break; 7456 case MachineCombinerPattern::FMLAv4f16_OP2: 7457 RC = &AArch64::FPR64RegClass; 7458 Opc = AArch64::FMLAv4f16; 7459 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7460 FMAInstKind::Accumulator); 7461 break; 7462 7463 case MachineCombinerPattern::FMLAv2i32_indexed_OP1: 7464 case MachineCombinerPattern::FMLAv2f32_OP1: 7465 RC = &AArch64::FPR64RegClass; 7466 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) { 7467 Opc = AArch64::FMLAv2i32_indexed; 7468 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7469 FMAInstKind::Indexed); 7470 } else { 7471 Opc = AArch64::FMLAv2f32; 7472 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7473 FMAInstKind::Accumulator); 7474 } 7475 break; 7476 case MachineCombinerPattern::FMLAv2i32_indexed_OP2: 7477 case MachineCombinerPattern::FMLAv2f32_OP2: 7478 RC = &AArch64::FPR64RegClass; 7479 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) { 7480 Opc = AArch64::FMLAv2i32_indexed; 7481 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7482 FMAInstKind::Indexed); 7483 } else { 7484 Opc = AArch64::FMLAv2f32; 7485 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7486 FMAInstKind::Accumulator); 7487 } 7488 break; 7489 7490 case MachineCombinerPattern::FMLAv8i16_indexed_OP1: 7491 RC = 
&AArch64::FPR128RegClass; 7492 Opc = AArch64::FMLAv8i16_indexed; 7493 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7494 FMAInstKind::Indexed); 7495 break; 7496 case MachineCombinerPattern::FMLAv8f16_OP1: 7497 RC = &AArch64::FPR128RegClass; 7498 Opc = AArch64::FMLAv8f16; 7499 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7500 FMAInstKind::Accumulator); 7501 break; 7502 case MachineCombinerPattern::FMLAv8i16_indexed_OP2: 7503 RC = &AArch64::FPR128RegClass; 7504 Opc = AArch64::FMLAv8i16_indexed; 7505 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7506 FMAInstKind::Indexed); 7507 break; 7508 case MachineCombinerPattern::FMLAv8f16_OP2: 7509 RC = &AArch64::FPR128RegClass; 7510 Opc = AArch64::FMLAv8f16; 7511 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7512 FMAInstKind::Accumulator); 7513 break; 7514 7515 case MachineCombinerPattern::FMLAv2i64_indexed_OP1: 7516 case MachineCombinerPattern::FMLAv2f64_OP1: 7517 RC = &AArch64::FPR128RegClass; 7518 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) { 7519 Opc = AArch64::FMLAv2i64_indexed; 7520 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7521 FMAInstKind::Indexed); 7522 } else { 7523 Opc = AArch64::FMLAv2f64; 7524 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7525 FMAInstKind::Accumulator); 7526 } 7527 break; 7528 case MachineCombinerPattern::FMLAv2i64_indexed_OP2: 7529 case MachineCombinerPattern::FMLAv2f64_OP2: 7530 RC = &AArch64::FPR128RegClass; 7531 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) { 7532 Opc = AArch64::FMLAv2i64_indexed; 7533 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7534 FMAInstKind::Indexed); 7535 } else { 7536 Opc = AArch64::FMLAv2f64; 7537 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7538 FMAInstKind::Accumulator); 7539 } 7540 break; 7541 7542 case MachineCombinerPattern::FMLAv4i32_indexed_OP1: 7543 case MachineCombinerPattern::FMLAv4f32_OP1: 7544 RC = &AArch64::FPR128RegClass; 7545 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) { 7546 Opc = AArch64::FMLAv4i32_indexed; 7547 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7548 FMAInstKind::Indexed); 7549 } else { 7550 Opc = AArch64::FMLAv4f32; 7551 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7552 FMAInstKind::Accumulator); 7553 } 7554 break; 7555 7556 case MachineCombinerPattern::FMLAv4i32_indexed_OP2: 7557 case MachineCombinerPattern::FMLAv4f32_OP2: 7558 RC = &AArch64::FPR128RegClass; 7559 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) { 7560 Opc = AArch64::FMLAv4i32_indexed; 7561 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7562 FMAInstKind::Indexed); 7563 } else { 7564 Opc = AArch64::FMLAv4f32; 7565 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7566 FMAInstKind::Accumulator); 7567 } 7568 break; 7569 7570 case MachineCombinerPattern::FMULSUBH_OP1: 7571 Opc = AArch64::FNMSUBHrrr; 7572 RC = &AArch64::FPR16RegClass; 7573 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7574 break; 7575 case MachineCombinerPattern::FMULSUBS_OP1: 7576 Opc = AArch64::FNMSUBSrrr; 7577 RC = &AArch64::FPR32RegClass; 7578 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7579 break; 7580 case MachineCombinerPattern::FMULSUBD_OP1: 7581 Opc = AArch64::FNMSUBDrrr; 7582 RC = &AArch64::FPR64RegClass; 7583 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7584 
break; 7585 7586 case MachineCombinerPattern::FNMULSUBH_OP1: 7587 Opc = AArch64::FNMADDHrrr; 7588 RC = &AArch64::FPR16RegClass; 7589 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7590 break; 7591 case MachineCombinerPattern::FNMULSUBS_OP1: 7592 Opc = AArch64::FNMADDSrrr; 7593 RC = &AArch64::FPR32RegClass; 7594 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7595 break; 7596 case MachineCombinerPattern::FNMULSUBD_OP1: 7597 Opc = AArch64::FNMADDDrrr; 7598 RC = &AArch64::FPR64RegClass; 7599 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 7600 break; 7601 7602 case MachineCombinerPattern::FMULSUBH_OP2: 7603 Opc = AArch64::FMSUBHrrr; 7604 RC = &AArch64::FPR16RegClass; 7605 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7606 break; 7607 case MachineCombinerPattern::FMULSUBS_OP2: 7608 Opc = AArch64::FMSUBSrrr; 7609 RC = &AArch64::FPR32RegClass; 7610 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7611 break; 7612 case MachineCombinerPattern::FMULSUBD_OP2: 7613 Opc = AArch64::FMSUBDrrr; 7614 RC = &AArch64::FPR64RegClass; 7615 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 7616 break; 7617 7618 case MachineCombinerPattern::FMLSv1i32_indexed_OP2: 7619 Opc = AArch64::FMLSv1i32_indexed; 7620 RC = &AArch64::FPR32RegClass; 7621 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7622 FMAInstKind::Indexed); 7623 break; 7624 7625 case MachineCombinerPattern::FMLSv1i64_indexed_OP2: 7626 Opc = AArch64::FMLSv1i64_indexed; 7627 RC = &AArch64::FPR64RegClass; 7628 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7629 FMAInstKind::Indexed); 7630 break; 7631 7632 case MachineCombinerPattern::FMLSv4f16_OP1: 7633 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: { 7634 RC = &AArch64::FPR64RegClass; 7635 Register NewVR = MRI.createVirtualRegister(RC); 7636 MachineInstrBuilder MIB1 = 7637 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR) 7638 .add(Root.getOperand(2)); 7639 InsInstrs.push_back(MIB1); 7640 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7641 if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) { 7642 Opc = AArch64::FMLAv4f16; 7643 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7644 FMAInstKind::Accumulator, &NewVR); 7645 } else { 7646 Opc = AArch64::FMLAv4i16_indexed; 7647 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7648 FMAInstKind::Indexed, &NewVR); 7649 } 7650 break; 7651 } 7652 case MachineCombinerPattern::FMLSv4f16_OP2: 7653 RC = &AArch64::FPR64RegClass; 7654 Opc = AArch64::FMLSv4f16; 7655 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7656 FMAInstKind::Accumulator); 7657 break; 7658 case MachineCombinerPattern::FMLSv4i16_indexed_OP2: 7659 RC = &AArch64::FPR64RegClass; 7660 Opc = AArch64::FMLSv4i16_indexed; 7661 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7662 FMAInstKind::Indexed); 7663 break; 7664 7665 case MachineCombinerPattern::FMLSv2f32_OP2: 7666 case MachineCombinerPattern::FMLSv2i32_indexed_OP2: 7667 RC = &AArch64::FPR64RegClass; 7668 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) { 7669 Opc = AArch64::FMLSv2i32_indexed; 7670 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7671 FMAInstKind::Indexed); 7672 } else { 7673 Opc = AArch64::FMLSv2f32; 7674 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7675 FMAInstKind::Accumulator); 7676 } 7677 break; 7678 7679 case 
MachineCombinerPattern::FMLSv8f16_OP1: 7680 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: { 7681 RC = &AArch64::FPR128RegClass; 7682 Register NewVR = MRI.createVirtualRegister(RC); 7683 MachineInstrBuilder MIB1 = 7684 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR) 7685 .add(Root.getOperand(2)); 7686 InsInstrs.push_back(MIB1); 7687 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7688 if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) { 7689 Opc = AArch64::FMLAv8f16; 7690 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7691 FMAInstKind::Accumulator, &NewVR); 7692 } else { 7693 Opc = AArch64::FMLAv8i16_indexed; 7694 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7695 FMAInstKind::Indexed, &NewVR); 7696 } 7697 break; 7698 } 7699 case MachineCombinerPattern::FMLSv8f16_OP2: 7700 RC = &AArch64::FPR128RegClass; 7701 Opc = AArch64::FMLSv8f16; 7702 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7703 FMAInstKind::Accumulator); 7704 break; 7705 case MachineCombinerPattern::FMLSv8i16_indexed_OP2: 7706 RC = &AArch64::FPR128RegClass; 7707 Opc = AArch64::FMLSv8i16_indexed; 7708 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7709 FMAInstKind::Indexed); 7710 break; 7711 7712 case MachineCombinerPattern::FMLSv2f64_OP2: 7713 case MachineCombinerPattern::FMLSv2i64_indexed_OP2: 7714 RC = &AArch64::FPR128RegClass; 7715 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) { 7716 Opc = AArch64::FMLSv2i64_indexed; 7717 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7718 FMAInstKind::Indexed); 7719 } else { 7720 Opc = AArch64::FMLSv2f64; 7721 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7722 FMAInstKind::Accumulator); 7723 } 7724 break; 7725 7726 case MachineCombinerPattern::FMLSv4f32_OP2: 7727 case MachineCombinerPattern::FMLSv4i32_indexed_OP2: 7728 RC = &AArch64::FPR128RegClass; 7729 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) { 7730 Opc = AArch64::FMLSv4i32_indexed; 7731 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7732 FMAInstKind::Indexed); 7733 } else { 7734 Opc = AArch64::FMLSv4f32; 7735 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 7736 FMAInstKind::Accumulator); 7737 } 7738 break; 7739 case MachineCombinerPattern::FMLSv2f32_OP1: 7740 case MachineCombinerPattern::FMLSv2i32_indexed_OP1: { 7741 RC = &AArch64::FPR64RegClass; 7742 Register NewVR = MRI.createVirtualRegister(RC); 7743 MachineInstrBuilder MIB1 = 7744 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR) 7745 .add(Root.getOperand(2)); 7746 InsInstrs.push_back(MIB1); 7747 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7748 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) { 7749 Opc = AArch64::FMLAv2i32_indexed; 7750 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7751 FMAInstKind::Indexed, &NewVR); 7752 } else { 7753 Opc = AArch64::FMLAv2f32; 7754 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7755 FMAInstKind::Accumulator, &NewVR); 7756 } 7757 break; 7758 } 7759 case MachineCombinerPattern::FMLSv4f32_OP1: 7760 case MachineCombinerPattern::FMLSv4i32_indexed_OP1: { 7761 RC = &AArch64::FPR128RegClass; 7762 Register NewVR = MRI.createVirtualRegister(RC); 7763 MachineInstrBuilder MIB1 = 7764 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR) 7765 .add(Root.getOperand(2)); 7766 InsInstrs.push_back(MIB1); 7767 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 
0)); 7768 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) { 7769 Opc = AArch64::FMLAv4i32_indexed; 7770 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7771 FMAInstKind::Indexed, &NewVR); 7772 } else { 7773 Opc = AArch64::FMLAv4f32; 7774 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7775 FMAInstKind::Accumulator, &NewVR); 7776 } 7777 break; 7778 } 7779 case MachineCombinerPattern::FMLSv2f64_OP1: 7780 case MachineCombinerPattern::FMLSv2i64_indexed_OP1: { 7781 RC = &AArch64::FPR128RegClass; 7782 Register NewVR = MRI.createVirtualRegister(RC); 7783 MachineInstrBuilder MIB1 = 7784 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR) 7785 .add(Root.getOperand(2)); 7786 InsInstrs.push_back(MIB1); 7787 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 7788 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) { 7789 Opc = AArch64::FMLAv2i64_indexed; 7790 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7791 FMAInstKind::Indexed, &NewVR); 7792 } else { 7793 Opc = AArch64::FMLAv2f64; 7794 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 7795 FMAInstKind::Accumulator, &NewVR); 7796 } 7797 break; 7798 } 7799 case MachineCombinerPattern::FMULv2i32_indexed_OP1: 7800 case MachineCombinerPattern::FMULv2i32_indexed_OP2: { 7801 unsigned IdxDupOp = 7802 (Pattern == MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1 : 2; 7803 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed, 7804 &AArch64::FPR128RegClass, MRI); 7805 break; 7806 } 7807 case MachineCombinerPattern::FMULv2i64_indexed_OP1: 7808 case MachineCombinerPattern::FMULv2i64_indexed_OP2: { 7809 unsigned IdxDupOp = 7810 (Pattern == MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1 : 2; 7811 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed, 7812 &AArch64::FPR128RegClass, MRI); 7813 break; 7814 } 7815 case MachineCombinerPattern::FMULv4i16_indexed_OP1: 7816 case MachineCombinerPattern::FMULv4i16_indexed_OP2: { 7817 unsigned IdxDupOp = 7818 (Pattern == MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1 : 2; 7819 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed, 7820 &AArch64::FPR128_loRegClass, MRI); 7821 break; 7822 } 7823 case MachineCombinerPattern::FMULv4i32_indexed_OP1: 7824 case MachineCombinerPattern::FMULv4i32_indexed_OP2: { 7825 unsigned IdxDupOp = 7826 (Pattern == MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1 : 2; 7827 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed, 7828 &AArch64::FPR128RegClass, MRI); 7829 break; 7830 } 7831 case MachineCombinerPattern::FMULv8i16_indexed_OP1: 7832 case MachineCombinerPattern::FMULv8i16_indexed_OP2: { 7833 unsigned IdxDupOp = 7834 (Pattern == MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1 : 2; 7835 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed, 7836 &AArch64::FPR128_loRegClass, MRI); 7837 break; 7838 } 7839 case MachineCombinerPattern::FNMADD: { 7840 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs); 7841 break; 7842 } 7843 7844 } // end switch (Pattern) 7845 // Record MUL and ADD/SUB for deletion 7846 if (MUL) 7847 DelInstrs.push_back(MUL); 7848 DelInstrs.push_back(&Root); 7849 7850 // Set the flags on the inserted instructions to be the merged flags of the 7851 // instructions that we have combined. 
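  // (These are MI-level flags such as nuw/nsw and the fast-math flags that
  // enabled the FP fusions in the first place.)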
7852   uint32_t Flags = Root.getFlags();
7853   if (MUL)
7854     Flags = Root.mergeFlagsWith(*MUL);
7855   for (auto *MI : InsInstrs)
7856     MI->setFlags(Flags);
7857 }
7858
7859 /// Replace csinc-branch sequence by simple conditional branch
7860 ///
7861 /// Examples:
7862 /// 1. \code
7863 ///   csinc w9, wzr, wzr, <condition code>
7864 ///   tbnz w9, #0, 0x44
7865 /// \endcode
7866 /// to
7867 /// \code
7868 ///   b.<inverted condition code>
7869 /// \endcode
7870 ///
7871 /// 2. \code
7872 ///   csinc w9, wzr, wzr, <condition code>
7873 ///   tbz w9, #0, 0x44
7874 /// \endcode
7875 /// to
7876 /// \code
7877 ///   b.<condition code>
7878 /// \endcode
7879 ///
7880 /// Replace compare and branch sequence by TBZ/TBNZ instruction when the
7881 /// compare's constant operand is a power of 2.
7882 ///
7883 /// Examples:
7884 /// \code
7885 ///   and  w8, w8, #0x400
7886 ///   cbnz w8, L1
7887 /// \endcode
7888 /// to
7889 /// \code
7890 ///   tbnz w8, #10, L1
7891 /// \endcode
7892 ///
7893 /// \param MI Conditional Branch
7894 /// \return True when the simple conditional branch is generated
7895 ///
7896 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
7897   bool IsNegativeBranch = false;
7898   bool IsTestAndBranch = false;
7899   unsigned TargetBBInMI = 0;
7900   switch (MI.getOpcode()) {
7901   default:
7902     llvm_unreachable("Unknown branch instruction?");
7903   case AArch64::Bcc:
7904     return false;
7905   case AArch64::CBZW:
7906   case AArch64::CBZX:
7907     TargetBBInMI = 1;
7908     break;
7909   case AArch64::CBNZW:
7910   case AArch64::CBNZX:
7911     TargetBBInMI = 1;
7912     IsNegativeBranch = true;
7913     break;
7914   case AArch64::TBZW:
7915   case AArch64::TBZX:
7916     TargetBBInMI = 2;
7917     IsTestAndBranch = true;
7918     break;
7919   case AArch64::TBNZW:
7920   case AArch64::TBNZX:
7921     TargetBBInMI = 2;
7922     IsNegativeBranch = true;
7923     IsTestAndBranch = true;
7924     break;
7925   }
7926   // So we increment a zero register and test for bits other
7927   // than bit 0? Conservatively bail out in case the verifier
7928   // missed this case.
7929   if (IsTestAndBranch && MI.getOperand(1).getImm())
7930     return false;
7931
7932   // Find Definition.
7933   assert(MI.getParent() && "Incomplete machine instruction\n");
7934   MachineBasicBlock *MBB = MI.getParent();
7935   MachineFunction *MF = MBB->getParent();
7936   MachineRegisterInfo *MRI = &MF->getRegInfo();
7937   Register VReg = MI.getOperand(0).getReg();
7938   if (!VReg.isVirtual())
7939     return false;
7940
7941   MachineInstr *DefMI = MRI->getVRegDef(VReg);
7942
7943   // Look through COPY instructions to find the definition.
7944   while (DefMI->isCopy()) {
7945     Register CopyVReg = DefMI->getOperand(1).getReg();
7946     if (!MRI->hasOneNonDBGUse(CopyVReg))
7947       return false;
7948     if (!MRI->hasOneDef(CopyVReg))
7949       return false;
7950     DefMI = MRI->getVRegDef(CopyVReg);
7951   }
7952
7953   switch (DefMI->getOpcode()) {
7954   default:
7955     return false;
7956   // Fold AND into a TBZ/TBNZ if the constant operand is a power of 2.
7957   case AArch64::ANDWri:
7958   case AArch64::ANDXri: {
7959     if (IsTestAndBranch)
7960       return false;
7961     if (DefMI->getParent() != MBB)
7962       return false;
7963     if (!MRI->hasOneNonDBGUse(VReg))
7964       return false;
7965
7966     bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
7967     uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
7968         DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
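    // decodeLogicalImmediate recovers the actual mask from the encoded
    // N:immr:imms form, e.g. 0x400 for the "and w8, w8, #0x400" example in
    // the function comment above.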
7969     if (!isPowerOf2_64(Mask))
7970       return false;
7971
7972     MachineOperand &MO = DefMI->getOperand(1);
7973     Register NewReg = MO.getReg();
7974     if (!NewReg.isVirtual())
7975       return false;
7976
7977     assert(!MRI->def_empty(NewReg) && "Register must be defined.");
7978
7979     MachineBasicBlock &RefToMBB = *MBB;
7980     MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
7981     DebugLoc DL = MI.getDebugLoc();
7982     unsigned Imm = Log2_64(Mask);
7983     unsigned Opc = (Imm < 32)
7984                        ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
7985                        : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
7986     MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
7987                               .addReg(NewReg)
7988                               .addImm(Imm)
7989                               .addMBB(TBB);
7990     // The register now lives on to the new TBZ/TBNZ.
7991     MO.setIsKill(false);
7992
7993     // For bit positions smaller than 32, we must use the 32-bit
7994     // variant (W) in all cases, because the 64-bit variant cannot
7995     // encode them.
7996     // Therefore, if the input register is 64-bit, we need to take its
7997     // 32-bit sub-register.
7998     if (!Is32Bit && Imm < 32)
7999       NewMI->getOperand(0).setSubReg(AArch64::sub_32);
8000     MI.eraseFromParent();
8001     return true;
8002   }
8003   // Look for CSINC.
8004   case AArch64::CSINCWr:
8005   case AArch64::CSINCXr: {
8006     if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
8007           DefMI->getOperand(2).getReg() == AArch64::WZR) &&
8008         !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
8009           DefMI->getOperand(2).getReg() == AArch64::XZR))
8010       return false;
8011
8012     if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
8013       return false;
8014
8015     AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
8016     // Convert only when the condition code is not modified between
8017     // the CSINC and the branch. The CC may be used by other
8018     // instructions in between.
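    // A CSINC of two zero registers yields 1 exactly when the condition is
    // false, which is why the negative branches (TBNZ/CBNZ) end up branching
    // on the inverted condition below.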
8019 if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write)) 8020 return false; 8021 MachineBasicBlock &RefToMBB = *MBB; 8022 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB(); 8023 DebugLoc DL = MI.getDebugLoc(); 8024 if (IsNegativeBranch) 8025 CC = AArch64CC::getInvertedCondCode(CC); 8026 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB); 8027 MI.eraseFromParent(); 8028 return true; 8029 } 8030 } 8031 } 8032 8033 std::pair<unsigned, unsigned> 8034 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { 8035 const unsigned Mask = AArch64II::MO_FRAGMENT; 8036 return std::make_pair(TF & Mask, TF & ~Mask); 8037 } 8038 8039 ArrayRef<std::pair<unsigned, const char *>> 8040 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { 8041 using namespace AArch64II; 8042 8043 static const std::pair<unsigned, const char *> TargetFlags[] = { 8044 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"}, 8045 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"}, 8046 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"}, 8047 {MO_HI12, "aarch64-hi12"}}; 8048 return ArrayRef(TargetFlags); 8049 } 8050 8051 ArrayRef<std::pair<unsigned, const char *>> 8052 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { 8053 using namespace AArch64II; 8054 8055 static const std::pair<unsigned, const char *> TargetFlags[] = { 8056 {MO_COFFSTUB, "aarch64-coffstub"}, 8057 {MO_GOT, "aarch64-got"}, 8058 {MO_NC, "aarch64-nc"}, 8059 {MO_S, "aarch64-s"}, 8060 {MO_TLS, "aarch64-tls"}, 8061 {MO_DLLIMPORT, "aarch64-dllimport"}, 8062 {MO_PREL, "aarch64-prel"}, 8063 {MO_TAGGED, "aarch64-tagged"}, 8064 {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"}, 8065 }; 8066 return ArrayRef(TargetFlags); 8067 } 8068 8069 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> 8070 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const { 8071 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] = 8072 {{MOSuppressPair, "aarch64-suppress-pair"}, 8073 {MOStridedAccess, "aarch64-strided-access"}}; 8074 return ArrayRef(TargetFlags); 8075 } 8076 8077 /// Constants defining how certain sequences should be outlined. 8078 /// This encompasses how an outlined function should be called, and what kind of 8079 /// frame should be emitted for that outlined function. 8080 /// 8081 /// \p MachineOutlinerDefault implies that the function should be called with 8082 /// a save and restore of LR to the stack. 8083 /// 8084 /// That is, 8085 /// 8086 /// I1 Save LR OUTLINED_FUNCTION: 8087 /// I2 --> BL OUTLINED_FUNCTION I1 8088 /// I3 Restore LR I2 8089 /// I3 8090 /// RET 8091 /// 8092 /// * Call construction overhead: 3 (save + BL + restore) 8093 /// * Frame construction overhead: 1 (ret) 8094 /// * Requires stack fixups? Yes 8095 /// 8096 /// \p MachineOutlinerTailCall implies that the function is being created from 8097 /// a sequence of instructions ending in a return. 8098 /// 8099 /// That is, 8100 /// 8101 /// I1 OUTLINED_FUNCTION: 8102 /// I2 --> B OUTLINED_FUNCTION I1 8103 /// RET I2 8104 /// RET 8105 /// 8106 /// * Call construction overhead: 1 (B) 8107 /// * Frame construction overhead: 0 (Return included in sequence) 8108 /// * Requires stack fixups? No 8109 /// 8110 /// \p MachineOutlinerNoLRSave implies that the function should be called using 8111 /// a BL instruction, but doesn't require LR to be saved and restored. This 8112 /// happens when LR is known to be dead. 
8113 /// 8114 /// That is, 8115 /// 8116 /// I1 OUTLINED_FUNCTION: 8117 /// I2 --> BL OUTLINED_FUNCTION I1 8118 /// I3 I2 8119 /// I3 8120 /// RET 8121 /// 8122 /// * Call construction overhead: 1 (BL) 8123 /// * Frame construction overhead: 1 (RET) 8124 /// * Requires stack fixups? No 8125 /// 8126 /// \p MachineOutlinerThunk implies that the function is being created from 8127 /// a sequence of instructions ending in a call. The outlined function is 8128 /// called with a BL instruction, and the outlined function tail-calls the 8129 /// original call destination. 8130 /// 8131 /// That is, 8132 /// 8133 /// I1 OUTLINED_FUNCTION: 8134 /// I2 --> BL OUTLINED_FUNCTION I1 8135 /// BL f I2 8136 /// B f 8137 /// * Call construction overhead: 1 (BL) 8138 /// * Frame construction overhead: 0 8139 /// * Requires stack fixups? No 8140 /// 8141 /// \p MachineOutlinerRegSave implies that the function should be called with a 8142 /// save and restore of LR to an available register. This allows us to avoid 8143 /// stack fixups. Note that this outlining variant is compatible with the 8144 /// NoLRSave case. 8145 /// 8146 /// That is, 8147 /// 8148 /// I1 Save LR OUTLINED_FUNCTION: 8149 /// I2 --> BL OUTLINED_FUNCTION I1 8150 /// I3 Restore LR I2 8151 /// I3 8152 /// RET 8153 /// 8154 /// * Call construction overhead: 3 (save + BL + restore) 8155 /// * Frame construction overhead: 1 (ret) 8156 /// * Requires stack fixups? No 8157 enum MachineOutlinerClass { 8158 MachineOutlinerDefault, /// Emit a save, restore, call, and return. 8159 MachineOutlinerTailCall, /// Only emit a branch. 8160 MachineOutlinerNoLRSave, /// Emit a call and return. 8161 MachineOutlinerThunk, /// Emit a call and tail-call. 8162 MachineOutlinerRegSave /// Same as default, but save to a register. 8163 }; 8164 8165 enum MachineOutlinerMBBFlags { 8166 LRUnavailableSomewhere = 0x2, 8167 HasCalls = 0x4, 8168 UnsafeRegsDead = 0x8 8169 }; 8170 8171 Register 8172 AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const { 8173 MachineFunction *MF = C.getMF(); 8174 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo(); 8175 const AArch64RegisterInfo *ARI = 8176 static_cast<const AArch64RegisterInfo *>(&TRI); 8177 // Check if there is an available register across the sequence that we can 8178 // use. 8179 for (unsigned Reg : AArch64::GPR64RegClass) { 8180 if (!ARI->isReservedReg(*MF, Reg) && 8181 Reg != AArch64::LR && // LR is not reserved, but don't use it. 8182 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved. 8183 Reg != AArch64::X17 && // Ditto for X17. 
8184 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) && 8185 C.isAvailableInsideSeq(Reg, TRI)) 8186 return Reg; 8187 } 8188 return Register(); 8189 } 8190 8191 static bool 8192 outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a, 8193 const outliner::Candidate &b) { 8194 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>(); 8195 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>(); 8196 8197 return MFIa->shouldSignReturnAddress(false) == MFIb->shouldSignReturnAddress(false) && 8198 MFIa->shouldSignReturnAddress(true) == MFIb->shouldSignReturnAddress(true); 8199 } 8200 8201 static bool 8202 outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a, 8203 const outliner::Candidate &b) { 8204 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>(); 8205 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>(); 8206 8207 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey(); 8208 } 8209 8210 static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a, 8211 const outliner::Candidate &b) { 8212 const AArch64Subtarget &SubtargetA = 8213 a.getMF()->getSubtarget<AArch64Subtarget>(); 8214 const AArch64Subtarget &SubtargetB = 8215 b.getMF()->getSubtarget<AArch64Subtarget>(); 8216 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps(); 8217 } 8218 8219 std::optional<outliner::OutlinedFunction> 8220 AArch64InstrInfo::getOutliningCandidateInfo( 8221 std::vector<outliner::Candidate> &RepeatedSequenceLocs) const { 8222 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0]; 8223 8224 unsigned SequenceSize = 0; 8225 for (auto &MI : FirstCand) 8226 SequenceSize += getInstSizeInBytes(MI); 8227 8228 unsigned NumBytesToCreateFrame = 0; 8229 8230 // We only allow outlining for functions having exactly matching return 8231 // address signing attributes, i.e., all share the same value for the 8232 // attribute "sign-return-address" and all share the same type of key they 8233 // are signed with. 8234 // Additionally we require all functions to simultaniously either support 8235 // v8.3a features or not. Otherwise an outlined function could get signed 8236 // using dedicated v8.3 instructions and a call from a function that doesn't 8237 // support v8.3 instructions would therefore be invalid. 8238 if (std::adjacent_find( 8239 RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), 8240 [](const outliner::Candidate &a, const outliner::Candidate &b) { 8241 // Return true if a and b are non-equal w.r.t. return address 8242 // signing or support of v8.3a features 8243 if (outliningCandidatesSigningScopeConsensus(a, b) && 8244 outliningCandidatesSigningKeyConsensus(a, b) && 8245 outliningCandidatesV8_3OpsConsensus(a, b)) { 8246 return false; 8247 } 8248 return true; 8249 }) != RepeatedSequenceLocs.end()) { 8250 return std::nullopt; 8251 } 8252 8253 // Since at this point all candidates agree on their return address signing 8254 // picking just one is fine. If the candidate functions potentially sign their 8255 // return addresses, the outlined function should do the same. Note that in 8256 // the case of "sign-return-address"="non-leaf" this is an assumption: It is 8257 // not certainly true that the outlined function will have to sign its return 8258 // address but this decision is made later, when the decision to outline 8259 // has already been made. 8260 // The same holds for the number of additional instructions we need: On 8261 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is 8262 // necessary. 
However, at this point we don't know if the outlined function 8263 // will have a RET instruction so we assume the worst. 8264 const TargetRegisterInfo &TRI = getRegisterInfo(); 8265 // Performing a tail call may require extra checks when PAuth is enabled. 8266 // If PAuth is disabled, set it to zero for uniformity. 8267 unsigned NumBytesToCheckLRInTCEpilogue = 0; 8268 if (FirstCand.getMF() 8269 ->getInfo<AArch64FunctionInfo>() 8270 ->shouldSignReturnAddress(true)) { 8271 // One PAC and one AUT instructions 8272 NumBytesToCreateFrame += 8; 8273 8274 // PAuth is enabled - set extra tail call cost, if any. 8275 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod(); 8276 NumBytesToCheckLRInTCEpilogue = 8277 AArch64PAuth::getCheckerSizeInBytes(LRCheckMethod); 8278 // Checking the authenticated LR value may significantly impact 8279 // SequenceSize, so account for it for more precise results. 8280 if (isTailCallReturnInst(RepeatedSequenceLocs[0].back())) 8281 SequenceSize += NumBytesToCheckLRInTCEpilogue; 8282 8283 // We have to check if sp modifying instructions would get outlined. 8284 // If so we only allow outlining if sp is unchanged overall, so matching 8285 // sub and add instructions are okay to outline, all other sp modifications 8286 // are not 8287 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) { 8288 int SPValue = 0; 8289 for (auto &MI : C) { 8290 if (MI.modifiesRegister(AArch64::SP, &TRI)) { 8291 switch (MI.getOpcode()) { 8292 case AArch64::ADDXri: 8293 case AArch64::ADDWri: 8294 assert(MI.getNumOperands() == 4 && "Wrong number of operands"); 8295 assert(MI.getOperand(2).isImm() && 8296 "Expected operand to be immediate"); 8297 assert(MI.getOperand(1).isReg() && 8298 "Expected operand to be a register"); 8299 // Check if the add just increments sp. If so, we search for 8300 // matching sub instructions that decrement sp. If not, the 8301 // modification is illegal 8302 if (MI.getOperand(1).getReg() == AArch64::SP) 8303 SPValue += MI.getOperand(2).getImm(); 8304 else 8305 return true; 8306 break; 8307 case AArch64::SUBXri: 8308 case AArch64::SUBWri: 8309 assert(MI.getNumOperands() == 4 && "Wrong number of operands"); 8310 assert(MI.getOperand(2).isImm() && 8311 "Expected operand to be immediate"); 8312 assert(MI.getOperand(1).isReg() && 8313 "Expected operand to be a register"); 8314 // Check if the sub just decrements sp. If so, we search for 8315 // matching add instructions that increment sp. If not, the 8316 // modification is illegal 8317 if (MI.getOperand(1).getReg() == AArch64::SP) 8318 SPValue -= MI.getOperand(2).getImm(); 8319 else 8320 return true; 8321 break; 8322 default: 8323 return true; 8324 } 8325 } 8326 } 8327 if (SPValue) 8328 return true; 8329 return false; 8330 }; 8331 // Remove candidates with illegal stack modifying instructions 8332 llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification); 8333 8334 // If the sequence doesn't have enough candidates left, then we're done. 8335 if (RepeatedSequenceLocs.size() < 2) 8336 return std::nullopt; 8337 } 8338 8339 // Properties about candidate MBBs that hold for all of them. 8340 unsigned FlagsSetInAll = 0xF; 8341 8342 // Compute liveness information for each candidate, and set FlagsSetInAll. 8343 for (outliner::Candidate &C : RepeatedSequenceLocs) 8344 FlagsSetInAll &= C.Flags; 8345 8346 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode(); 8347 8348 // Helper lambda which sets call information for every candidate. 
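  // (Per candidate, the call info records which MachineOutlinerClass variant
  // to use and how many bytes the call sequence is expected to cost.)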
8349 auto SetCandidateCallInfo =
8350 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
8351 for (outliner::Candidate &C : RepeatedSequenceLocs)
8352 C.setCallInfo(CallID, NumBytesForCall);
8353 };
8354
8355 unsigned FrameID = MachineOutlinerDefault;
8356 NumBytesToCreateFrame += 4;
8357
8358 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
8359 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
8360 });
8361
8362 // We check to see if CFI Instructions are present, and if they are
8363 // we find the number of CFI Instructions in the candidates.
8364 unsigned CFICount = 0;
8365 for (auto &I : RepeatedSequenceLocs[0]) {
8366 if (I.isCFIInstruction())
8367 CFICount++;
8368 }
8369
8370 // We compare the number of found CFI Instructions to the number of CFI
8371 // instructions in the parent function for each candidate. We must check this
8372 // since if we outline one of the CFI instructions in a function, we have to
8373 // outline them all for correctness. If we do not, the address offsets will be
8374 // incorrect between the two sections of the program.
8375 for (outliner::Candidate &C : RepeatedSequenceLocs) {
8376 std::vector<MCCFIInstruction> CFIInstructions =
8377 C.getMF()->getFrameInstructions();
8378
8379 if (CFICount > 0 && CFICount != CFIInstructions.size())
8380 return std::nullopt;
8381 }
8382
8383 // Returns true if an instruction is safe to fix up, false otherwise.
8384 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
8385 if (MI.isCall())
8386 return true;
8387
8388 if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
8389 !MI.readsRegister(AArch64::SP, &TRI))
8390 return true;
8391
8392 // Any modification of SP will break our code to save/restore LR.
8393 // FIXME: We could handle some instructions which add a constant
8394 // offset to SP, with a bit more work.
8395 if (MI.modifiesRegister(AArch64::SP, &TRI))
8396 return false;
8397
8398 // At this point, we have a stack instruction that we might need to
8399 // fix up. We'll handle it if it's a load or store.
8400 if (MI.mayLoadOrStore()) {
8401 const MachineOperand *Base; // Filled with the base operand of MI.
8402 int64_t Offset; // Filled with the offset of MI.
8403 bool OffsetIsScalable;
8404
8405 // Does it allow us to offset the base operand and is the base the
8406 // register SP?
8407 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
8408 !Base->isReg() || Base->getReg() != AArch64::SP)
8409 return false;
8410
8411 // Fix-up code below assumes byte offsets.
8412 if (OffsetIsScalable)
8413 return false;
8414
8415 // Find the minimum/maximum offset for this instruction and check
8416 // if fixing it up would be in range.
8417 int64_t MinOffset,
8418 MaxOffset; // Unscaled offsets for the instruction.
8419 // The scale to multiply the offsets by.
8420 TypeSize Scale(0U, false), DummyWidth(0U, false);
8421 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
8422
8423 Offset += 16; // Update the offset to what it would be if we outlined.
8424 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
8425 Offset > MaxOffset * (int64_t)Scale.getFixedValue())
8426 return false;
8427
8428 // It's in range, so we can outline it.
8429 return true;
8430 }
8431
8432 // FIXME: Add handling for instructions like "add x0, sp, #8".
8433
8434 // We can't fix it up, so don't outline it.
8435 return false;
8436 };
8437
8438 // True if it's possible to fix up each stack instruction in this sequence.
8439 // Important for frames/call variants that modify the stack. 8440 bool AllStackInstrsSafe = llvm::all_of(FirstCand, IsSafeToFixup); 8441 8442 // If the last instruction in any candidate is a terminator, then we should 8443 // tail call all of the candidates. 8444 if (RepeatedSequenceLocs[0].back().isTerminator()) { 8445 FrameID = MachineOutlinerTailCall; 8446 NumBytesToCreateFrame = 0; 8447 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue; 8448 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall); 8449 } 8450 8451 else if (LastInstrOpcode == AArch64::BL || 8452 ((LastInstrOpcode == AArch64::BLR || 8453 LastInstrOpcode == AArch64::BLRNoIP) && 8454 !HasBTI)) { 8455 // FIXME: Do we need to check if the code after this uses the value of LR? 8456 FrameID = MachineOutlinerThunk; 8457 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue; 8458 SetCandidateCallInfo(MachineOutlinerThunk, 4); 8459 } 8460 8461 else { 8462 // We need to decide how to emit calls + frames. We can always emit the same 8463 // frame if we don't need to save to the stack. If we have to save to the 8464 // stack, then we need a different frame. 8465 unsigned NumBytesNoStackCalls = 0; 8466 std::vector<outliner::Candidate> CandidatesWithoutStackFixups; 8467 8468 // Check if we have to save LR. 8469 for (outliner::Candidate &C : RepeatedSequenceLocs) { 8470 bool LRAvailable = 8471 (C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere) 8472 ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) 8473 : true; 8474 // If we have a noreturn caller, then we're going to be conservative and 8475 // say that we have to save LR. If we don't have a ret at the end of the 8476 // block, then we can't reason about liveness accurately. 8477 // 8478 // FIXME: We can probably do better than always disabling this in 8479 // noreturn functions by fixing up the liveness info. 8480 bool IsNoReturn = 8481 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn); 8482 8483 // Is LR available? If so, we don't need a save. 8484 if (LRAvailable && !IsNoReturn) { 8485 NumBytesNoStackCalls += 4; 8486 C.setCallInfo(MachineOutlinerNoLRSave, 4); 8487 CandidatesWithoutStackFixups.push_back(C); 8488 } 8489 8490 // Is an unused register available? If so, we won't modify the stack, so 8491 // we can outline with the same frame type as those that don't save LR. 8492 else if (findRegisterToSaveLRTo(C)) { 8493 NumBytesNoStackCalls += 12; 8494 C.setCallInfo(MachineOutlinerRegSave, 12); 8495 CandidatesWithoutStackFixups.push_back(C); 8496 } 8497 8498 // Is SP used in the sequence at all? If not, we don't have to modify 8499 // the stack, so we are guaranteed to get the same frame. 8500 else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) { 8501 NumBytesNoStackCalls += 12; 8502 C.setCallInfo(MachineOutlinerDefault, 12); 8503 CandidatesWithoutStackFixups.push_back(C); 8504 } 8505 8506 // If we outline this, we need to modify the stack. Pretend we don't 8507 // outline this by saving all of its bytes. 8508 else { 8509 NumBytesNoStackCalls += SequenceSize; 8510 } 8511 } 8512 8513 // If there are no places where we have to save LR, then note that we 8514 // don't have to update the stack. Otherwise, give every candidate the 8515 // default call type, as long as it's safe to do so. 
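// Roughly, the per-call-site costs set above are: 4 bytes for a plain BL when
// LR is free, 12 bytes for MOV-to-register + BL + MOV-back, and 12 bytes for
// STR + BL + LDR when LR must be spilled to the stack. The comparison below
// therefore asks whether the stack-free strategies are, on aggregate, no more
// expensive than giving every call site the 12-byte default save of LR.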
8516 if (!AllStackInstrsSafe || 8517 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) { 8518 RepeatedSequenceLocs = CandidatesWithoutStackFixups; 8519 FrameID = MachineOutlinerNoLRSave; 8520 } else { 8521 SetCandidateCallInfo(MachineOutlinerDefault, 12); 8522 8523 // Bugzilla ID: 46767 8524 // TODO: Check if fixing up the stack more than once is safe so we can 8525 // outline these. 8526 // 8527 // An outline resulting in a caller that requires stack fixups at the 8528 // callsite to a callee that also requires stack fixups can happen when 8529 // there are no available registers at the candidate callsite for a 8530 // candidate that itself also has calls. 8531 // 8532 // In other words if function_containing_sequence in the following pseudo 8533 // assembly requires that we save LR at the point of the call, but there 8534 // are no available registers: in this case we save using SP and as a 8535 // result the SP offsets requires stack fixups by multiples of 16. 8536 // 8537 // function_containing_sequence: 8538 // ... 8539 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N 8540 // call OUTLINED_FUNCTION_N 8541 // restore LR from SP 8542 // ... 8543 // 8544 // OUTLINED_FUNCTION_N: 8545 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N 8546 // ... 8547 // bl foo 8548 // restore LR from SP 8549 // ret 8550 // 8551 // Because the code to handle more than one stack fixup does not 8552 // currently have the proper checks for legality, these cases will assert 8553 // in the AArch64 MachineOutliner. This is because the code to do this 8554 // needs more hardening, testing, better checks that generated code is 8555 // legal, etc and because it is only verified to handle a single pass of 8556 // stack fixup. 8557 // 8558 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch 8559 // these cases until they are known to be handled. Bugzilla 46767 is 8560 // referenced in comments at the assert site. 8561 // 8562 // To avoid asserting (or generating non-legal code on noassert builds) 8563 // we remove all candidates which would need more than one stack fixup by 8564 // pruning the cases where the candidate has calls while also having no 8565 // available LR and having no available general purpose registers to copy 8566 // LR to (ie one extra stack save/restore). 8567 // 8568 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { 8569 erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) { 8570 auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); }; 8571 return (llvm::any_of(C, IsCall)) && 8572 (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) || 8573 !findRegisterToSaveLRTo(C)); 8574 }); 8575 } 8576 } 8577 8578 // If we dropped all of the candidates, bail out here. 8579 if (RepeatedSequenceLocs.size() < 2) { 8580 RepeatedSequenceLocs.clear(); 8581 return std::nullopt; 8582 } 8583 } 8584 8585 // Does every candidate's MBB contain a call? If so, then we might have a call 8586 // in the range. 8587 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { 8588 // Check if the range contains a call. These require a save + restore of the 8589 // link register. 8590 bool ModStackToSaveLR = false; 8591 if (std::any_of(FirstCand.begin(), std::prev(FirstCand.end()), 8592 [](const MachineInstr &MI) { return MI.isCall(); })) 8593 ModStackToSaveLR = true; 8594 8595 // Handle the last instruction separately. If this is a tail call, then the 8596 // last instruction is a call. We don't want to save + restore in this case. 
8597 // However, it could be possible that the last instruction is a call without 8598 // it being valid to tail call this sequence. We should consider this as 8599 // well. 8600 else if (FrameID != MachineOutlinerThunk && 8601 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall()) 8602 ModStackToSaveLR = true; 8603 8604 if (ModStackToSaveLR) { 8605 // We can't fix up the stack. Bail out. 8606 if (!AllStackInstrsSafe) { 8607 RepeatedSequenceLocs.clear(); 8608 return std::nullopt; 8609 } 8610 8611 // Save + restore LR. 8612 NumBytesToCreateFrame += 8; 8613 } 8614 } 8615 8616 // If we have CFI instructions, we can only outline if the outlined section 8617 // can be a tail call 8618 if (FrameID != MachineOutlinerTailCall && CFICount > 0) 8619 return std::nullopt; 8620 8621 return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, 8622 NumBytesToCreateFrame, FrameID); 8623 } 8624 8625 void AArch64InstrInfo::mergeOutliningCandidateAttributes( 8626 Function &F, std::vector<outliner::Candidate> &Candidates) const { 8627 // If a bunch of candidates reach this point they must agree on their return 8628 // address signing. It is therefore enough to just consider the signing 8629 // behaviour of one of them 8630 const auto &CFn = Candidates.front().getMF()->getFunction(); 8631 8632 // Since all candidates belong to the same module, just copy the 8633 // function-level attributes of an arbitrary function. 8634 if (CFn.hasFnAttribute("sign-return-address")) 8635 F.addFnAttr(CFn.getFnAttribute("sign-return-address")); 8636 if (CFn.hasFnAttribute("sign-return-address-key")) 8637 F.addFnAttr(CFn.getFnAttribute("sign-return-address-key")); 8638 8639 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates); 8640 } 8641 8642 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom( 8643 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const { 8644 const Function &F = MF.getFunction(); 8645 8646 // Can F be deduplicated by the linker? If it can, don't outline from it. 8647 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage()) 8648 return false; 8649 8650 // Don't outline from functions with section markings; the program could 8651 // expect that all the code is in the named section. 8652 // FIXME: Allow outlining from multiple functions with the same section 8653 // marking. 8654 if (F.hasSection()) 8655 return false; 8656 8657 // Outlining from functions with redzones is unsafe since the outliner may 8658 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't 8659 // outline from it. 8660 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); 8661 if (!AFI || AFI->hasRedZone().value_or(true)) 8662 return false; 8663 8664 // FIXME: Teach the outliner to generate/handle Windows unwind info. 8665 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) 8666 return false; 8667 8668 // It's safe to outline from MF. 
8669 return true; 8670 } 8671 8672 SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>> 8673 AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB, 8674 unsigned &Flags) const { 8675 assert(MBB.getParent()->getRegInfo().tracksLiveness() && 8676 "Must track liveness!"); 8677 SmallVector< 8678 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>> 8679 Ranges; 8680 // According to the AArch64 Procedure Call Standard, the following are 8681 // undefined on entry/exit from a function call: 8682 // 8683 // * Registers x16, x17, (and thus w16, w17) 8684 // * Condition codes (and thus the NZCV register) 8685 // 8686 // If any of these registers are used inside or live across an outlined 8687 // function, then they may be modified later, either by the compiler or 8688 // some other tool (like the linker). 8689 // 8690 // To avoid outlining in these situations, partition each block into ranges 8691 // where these registers are dead. We will only outline from those ranges. 8692 LiveRegUnits LRU(getRegisterInfo()); 8693 auto AreAllUnsafeRegsDead = [&LRU]() { 8694 return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) && 8695 LRU.available(AArch64::NZCV); 8696 }; 8697 8698 // We need to know if LR is live across an outlining boundary later on in 8699 // order to decide how we'll create the outlined call, frame, etc. 8700 // 8701 // It's pretty expensive to check this for *every candidate* within a block. 8702 // That's some potentially n^2 behaviour, since in the worst case, we'd need 8703 // to compute liveness from the end of the block for O(n) candidates within 8704 // the block. 8705 // 8706 // So, to improve the average case, let's keep track of liveness from the end 8707 // of the block to the beginning of *every outlinable range*. If we know that 8708 // LR is available in every range we could outline from, then we know that 8709 // we don't need to check liveness for any candidate within that range. 8710 bool LRAvailableEverywhere = true; 8711 // Compute liveness bottom-up. 8712 LRU.addLiveOuts(MBB); 8713 // Update flags that require info about the entire MBB. 8714 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) { 8715 if (MI.isCall() && !MI.isTerminator()) 8716 Flags |= MachineOutlinerMBBFlags::HasCalls; 8717 }; 8718 // Range: [RangeBegin, RangeEnd) 8719 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd; 8720 unsigned RangeLen; 8721 auto CreateNewRangeStartingAt = 8722 [&RangeBegin, &RangeEnd, 8723 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) { 8724 RangeBegin = NewBegin; 8725 RangeEnd = std::next(RangeBegin); 8726 RangeLen = 0; 8727 }; 8728 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() { 8729 // At least one unsafe register is not dead. We do not want to outline at 8730 // this point. If it is long enough to outline from, save the range 8731 // [RangeBegin, RangeEnd). 8732 if (RangeLen > 1) 8733 Ranges.push_back(std::make_pair(RangeBegin, RangeEnd)); 8734 }; 8735 // Find the first point where all unsafe registers are dead. 8736 // FIND: <safe instr> <-- end of first potential range 8737 // SKIP: <unsafe def> 8738 // SKIP: ... everything between ... 8739 // SKIP: <unsafe use> 8740 auto FirstPossibleEndPt = MBB.instr_rbegin(); 8741 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) { 8742 LRU.stepBackward(*FirstPossibleEndPt); 8743 // Update flags that impact how we outline across the entire block, 8744 // regardless of safety. 
8745 UpdateWholeMBBFlags(*FirstPossibleEndPt); 8746 if (AreAllUnsafeRegsDead()) 8747 break; 8748 } 8749 // If we exhausted the entire block, we have no safe ranges to outline. 8750 if (FirstPossibleEndPt == MBB.instr_rend()) 8751 return Ranges; 8752 // Current range. 8753 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator()); 8754 // StartPt points to the first place where all unsafe registers 8755 // are dead (if there is any such point). Begin partitioning the MBB into 8756 // ranges. 8757 for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) { 8758 LRU.stepBackward(MI); 8759 UpdateWholeMBBFlags(MI); 8760 if (!AreAllUnsafeRegsDead()) { 8761 SaveRangeIfNonEmpty(); 8762 CreateNewRangeStartingAt(MI.getIterator()); 8763 continue; 8764 } 8765 LRAvailableEverywhere &= LRU.available(AArch64::LR); 8766 RangeBegin = MI.getIterator(); 8767 ++RangeLen; 8768 } 8769 // Above loop misses the last (or only) range. If we are still safe, then 8770 // let's save the range. 8771 if (AreAllUnsafeRegsDead()) 8772 SaveRangeIfNonEmpty(); 8773 if (Ranges.empty()) 8774 return Ranges; 8775 // We found the ranges bottom-up. Mapping expects the top-down. Reverse 8776 // the order. 8777 std::reverse(Ranges.begin(), Ranges.end()); 8778 // If there is at least one outlinable range where LR is unavailable 8779 // somewhere, remember that. 8780 if (!LRAvailableEverywhere) 8781 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere; 8782 return Ranges; 8783 } 8784 8785 outliner::InstrType 8786 AArch64InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT, 8787 unsigned Flags) const { 8788 MachineInstr &MI = *MIT; 8789 MachineBasicBlock *MBB = MI.getParent(); 8790 MachineFunction *MF = MBB->getParent(); 8791 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); 8792 8793 // Don't outline anything used for return address signing. The outlined 8794 // function will get signed later if needed 8795 switch (MI.getOpcode()) { 8796 case AArch64::PACM: 8797 case AArch64::PACIASP: 8798 case AArch64::PACIBSP: 8799 case AArch64::PACIASPPC: 8800 case AArch64::PACIBSPPC: 8801 case AArch64::AUTIASP: 8802 case AArch64::AUTIBSP: 8803 case AArch64::AUTIASPPCi: 8804 case AArch64::AUTIASPPCr: 8805 case AArch64::AUTIBSPPCi: 8806 case AArch64::AUTIBSPPCr: 8807 case AArch64::RETAA: 8808 case AArch64::RETAB: 8809 case AArch64::RETAASPPCi: 8810 case AArch64::RETAASPPCr: 8811 case AArch64::RETABSPPCi: 8812 case AArch64::RETABSPPCr: 8813 case AArch64::EMITBKEY: 8814 case AArch64::PAUTH_PROLOGUE: 8815 case AArch64::PAUTH_EPILOGUE: 8816 return outliner::InstrType::Illegal; 8817 } 8818 8819 // Don't outline LOHs. 8820 if (FuncInfo->getLOHRelated().count(&MI)) 8821 return outliner::InstrType::Illegal; 8822 8823 // We can only outline these if we will tail call the outlined function, or 8824 // fix up the CFI offsets. Currently, CFI instructions are outlined only if 8825 // in a tail call. 8826 // 8827 // FIXME: If the proper fixups for the offset are implemented, this should be 8828 // possible. 8829 if (MI.isCFIInstruction()) 8830 return outliner::InstrType::Legal; 8831 8832 // Is this a terminator for a basic block? 8833 if (MI.isTerminator()) 8834 // TargetInstrInfo::getOutliningType has already filtered out anything 8835 // that would break this, so we can allow it here. 8836 return outliner::InstrType::Legal; 8837 8838 // Make sure none of the operands are un-outlinable. 
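// (In practice this rejects anything that names LR or W30 explicitly, since
// the BL used to reach the outlined function clobbers LR at every call site.)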
8839 for (const MachineOperand &MOP : MI.operands()) {
8840 // A check preventing CFI indices was here before, but only CFI
8841 // instructions should have those.
8842 assert(!MOP.isCFIIndex());
8843
8844 // If it uses LR or W30 explicitly, then don't touch it.
8845 if (MOP.isReg() && !MOP.isImplicit() &&
8846 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
8847 return outliner::InstrType::Illegal;
8848 }
8849
8850 // Special cases for instructions that can always be outlined, but will fail
8851 // the later tests, e.g. ADRPs, which are PC-relative and use LR, but can always
8852 // be outlined because they don't require a *specific* value to be in LR.
8853 if (MI.getOpcode() == AArch64::ADRP)
8854 return outliner::InstrType::Legal;
8855
8856 // If MI is a call we might be able to outline it. We don't want to outline
8857 // any calls that rely on the position of items on the stack. When we outline
8858 // something containing a call, we have to emit a save and restore of LR in
8859 // the outlined function. Currently, this always happens by saving LR to the
8860 // stack. Thus, if we outline, say, half the parameters for a function call
8861 // plus the call, then we'll break the callee's expectations for the layout
8862 // of the stack.
8863 //
8864 // FIXME: Allow calls to functions which construct a stack frame, as long
8865 // as they don't access arguments on the stack.
8866 // FIXME: Figure out some way to analyze functions defined in other modules.
8867 // We should be able to compute the memory usage based on the IR calling
8868 // convention, even if we can't see the definition.
8869 if (MI.isCall()) {
8870 // Get the function associated with the call. Look at each operand and find
8871 // the one that represents the callee and get its name.
8872 const Function *Callee = nullptr;
8873 for (const MachineOperand &MOP : MI.operands()) {
8874 if (MOP.isGlobal()) {
8875 Callee = dyn_cast<Function>(MOP.getGlobal());
8876 break;
8877 }
8878 }
8879
8880 // Never outline calls to mcount. There isn't any rule that would require
8881 // this, but the Linux kernel's "ftrace" feature depends on it.
8882 if (Callee && Callee->getName() == "\01_mcount")
8883 return outliner::InstrType::Illegal;
8884
8885 // If we don't know anything about the callee, assume it depends on the
8886 // stack layout of the caller. In that case, it's only legal to outline
8887 // as a tail-call. Explicitly list the call instructions we know about so we
8888 // don't get unexpected results with call pseudo-instructions.
8889 auto UnknownCallOutlineType = outliner::InstrType::Illegal;
8890 if (MI.getOpcode() == AArch64::BLR ||
8891 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
8892 UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
8893
8894 if (!Callee)
8895 return UnknownCallOutlineType;
8896
8897 // We have a function we have information about. Check if it's something we
8898 // can safely outline.
8899 MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);
8900
8901 // We don't know what's going on with the callee at all. Don't touch it.
8902 if (!CalleeMF)
8903 return UnknownCallOutlineType;
8904
8905 // Check if we know anything about the callee saves on the function. If we
8906 // don't, then don't touch it, since that implies that we haven't
8907 // computed anything about its stack frame yet.
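// A callee with valid callee-saved info, a zero-sized stack frame and no
// stack objects cannot be reading its arguments off the caller's stack, which
// is the property the checks below are really testing for.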
8908 MachineFrameInfo &MFI = CalleeMF->getFrameInfo(); 8909 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 || 8910 MFI.getNumObjects() > 0) 8911 return UnknownCallOutlineType; 8912 8913 // At this point, we can say that CalleeMF ought to not pass anything on the 8914 // stack. Therefore, we can outline it. 8915 return outliner::InstrType::Legal; 8916 } 8917 8918 // Don't touch the link register or W30. 8919 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) || 8920 MI.modifiesRegister(AArch64::W30, &getRegisterInfo())) 8921 return outliner::InstrType::Illegal; 8922 8923 // Don't outline BTI instructions, because that will prevent the outlining 8924 // site from being indirectly callable. 8925 if (hasBTISemantics(MI)) 8926 return outliner::InstrType::Illegal; 8927 8928 return outliner::InstrType::Legal; 8929 } 8930 8931 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const { 8932 for (MachineInstr &MI : MBB) { 8933 const MachineOperand *Base; 8934 TypeSize Width(0, false); 8935 int64_t Offset; 8936 bool OffsetIsScalable; 8937 8938 // Is this a load or store with an immediate offset with SP as the base? 8939 if (!MI.mayLoadOrStore() || 8940 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width, 8941 &RI) || 8942 (Base->isReg() && Base->getReg() != AArch64::SP)) 8943 continue; 8944 8945 // It is, so we have to fix it up. 8946 TypeSize Scale(0U, false); 8947 int64_t Dummy1, Dummy2; 8948 8949 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI); 8950 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!"); 8951 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2); 8952 assert(Scale != 0 && "Unexpected opcode!"); 8953 assert(!OffsetIsScalable && "Expected offset to be a byte offset"); 8954 8955 // We've pushed the return address to the stack, so add 16 to the offset. 8956 // This is safe, since we already checked if it would overflow when we 8957 // checked if this instruction was legal to outline. 8958 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue(); 8959 StackOffsetOperand.setImm(NewImm); 8960 } 8961 } 8962 8963 static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, 8964 const AArch64InstrInfo *TII, 8965 bool ShouldSignReturnAddr) { 8966 if (!ShouldSignReturnAddr) 8967 return; 8968 8969 BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE)) 8970 .setMIFlag(MachineInstr::FrameSetup); 8971 BuildMI(MBB, MBB.getFirstInstrTerminator(), DebugLoc(), 8972 TII->get(AArch64::PAUTH_EPILOGUE)) 8973 .setMIFlag(MachineInstr::FrameDestroy); 8974 } 8975 8976 void AArch64InstrInfo::buildOutlinedFrame( 8977 MachineBasicBlock &MBB, MachineFunction &MF, 8978 const outliner::OutlinedFunction &OF) const { 8979 8980 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>(); 8981 8982 if (OF.FrameConstructionID == MachineOutlinerTailCall) 8983 FI->setOutliningStyle("Tail Call"); 8984 else if (OF.FrameConstructionID == MachineOutlinerThunk) { 8985 // For thunk outlining, rewrite the last instruction from a call to a 8986 // tail-call. 
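// Roughly speaking, a trailing "bl callee" in the sequence becomes a plain
// "b callee" in the outlined thunk (via TCRETURNdi, or TCRETURNriALL for the
// register forms), so the callee returns straight to the original call sites
// and the thunk itself needs no return instruction.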
8987 MachineInstr *Call = &*--MBB.instr_end(); 8988 unsigned TailOpcode; 8989 if (Call->getOpcode() == AArch64::BL) { 8990 TailOpcode = AArch64::TCRETURNdi; 8991 } else { 8992 assert(Call->getOpcode() == AArch64::BLR || 8993 Call->getOpcode() == AArch64::BLRNoIP); 8994 TailOpcode = AArch64::TCRETURNriALL; 8995 } 8996 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode)) 8997 .add(Call->getOperand(0)) 8998 .addImm(0); 8999 MBB.insert(MBB.end(), TC); 9000 Call->eraseFromParent(); 9001 9002 FI->setOutliningStyle("Thunk"); 9003 } 9004 9005 bool IsLeafFunction = true; 9006 9007 // Is there a call in the outlined range? 9008 auto IsNonTailCall = [](const MachineInstr &MI) { 9009 return MI.isCall() && !MI.isReturn(); 9010 }; 9011 9012 if (llvm::any_of(MBB.instrs(), IsNonTailCall)) { 9013 // Fix up the instructions in the range, since we're going to modify the 9014 // stack. 9015 9016 // Bugzilla ID: 46767 9017 // TODO: Check if fixing up twice is safe so we can outline these. 9018 assert(OF.FrameConstructionID != MachineOutlinerDefault && 9019 "Can only fix up stack references once"); 9020 fixupPostOutline(MBB); 9021 9022 IsLeafFunction = false; 9023 9024 // LR has to be a live in so that we can save it. 9025 if (!MBB.isLiveIn(AArch64::LR)) 9026 MBB.addLiveIn(AArch64::LR); 9027 9028 MachineBasicBlock::iterator It = MBB.begin(); 9029 MachineBasicBlock::iterator Et = MBB.end(); 9030 9031 if (OF.FrameConstructionID == MachineOutlinerTailCall || 9032 OF.FrameConstructionID == MachineOutlinerThunk) 9033 Et = std::prev(MBB.end()); 9034 9035 // Insert a save before the outlined region 9036 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) 9037 .addReg(AArch64::SP, RegState::Define) 9038 .addReg(AArch64::LR) 9039 .addReg(AArch64::SP) 9040 .addImm(-16); 9041 It = MBB.insert(It, STRXpre); 9042 9043 if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) { 9044 const TargetSubtargetInfo &STI = MF.getSubtarget(); 9045 const MCRegisterInfo *MRI = STI.getRegisterInfo(); 9046 unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true); 9047 9048 // Add a CFI saying the stack was moved 16 B down. 9049 int64_t StackPosEntry = 9050 MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16)); 9051 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) 9052 .addCFIIndex(StackPosEntry) 9053 .setMIFlags(MachineInstr::FrameSetup); 9054 9055 // Add a CFI saying that the LR that we want to find is now 16 B higher 9056 // than before. 9057 int64_t LRPosEntry = MF.addFrameInst( 9058 MCCFIInstruction::createOffset(nullptr, DwarfReg, -16)); 9059 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) 9060 .addCFIIndex(LRPosEntry) 9061 .setMIFlags(MachineInstr::FrameSetup); 9062 } 9063 9064 // Insert a restore before the terminator for the function. 9065 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) 9066 .addReg(AArch64::SP, RegState::Define) 9067 .addReg(AArch64::LR, RegState::Define) 9068 .addReg(AArch64::SP) 9069 .addImm(16); 9070 Et = MBB.insert(Et, LDRXpost); 9071 } 9072 9073 bool ShouldSignReturnAddr = FI->shouldSignReturnAddress(!IsLeafFunction); 9074 9075 // If this is a tail call outlined function, then there's already a return. 9076 if (OF.FrameConstructionID == MachineOutlinerTailCall || 9077 OF.FrameConstructionID == MachineOutlinerThunk) { 9078 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr); 9079 return; 9080 } 9081 9082 // It's not a tail call, so we have to insert the return ourselves. 
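// For a non-tail-call outlined function the end result is, roughly:
//
//   OUTLINED_FUNCTION_N:
//     str x30, [sp, #-16]!   ; only emitted above if the body contains calls
//     ...outlined sequence...
//     ldr x30, [sp], #16     ; only emitted above if the body contains calls
//     ret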
9083 9084 // LR has to be a live in so that we can return to it. 9085 if (!MBB.isLiveIn(AArch64::LR)) 9086 MBB.addLiveIn(AArch64::LR); 9087 9088 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET)) 9089 .addReg(AArch64::LR); 9090 MBB.insert(MBB.end(), ret); 9091 9092 signOutlinedFunction(MF, MBB, this, ShouldSignReturnAddr); 9093 9094 FI->setOutliningStyle("Function"); 9095 9096 // Did we have to modify the stack by saving the link register? 9097 if (OF.FrameConstructionID != MachineOutlinerDefault) 9098 return; 9099 9100 // We modified the stack. 9101 // Walk over the basic block and fix up all the stack accesses. 9102 fixupPostOutline(MBB); 9103 } 9104 9105 MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall( 9106 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, 9107 MachineFunction &MF, outliner::Candidate &C) const { 9108 9109 // Are we tail calling? 9110 if (C.CallConstructionID == MachineOutlinerTailCall) { 9111 // If yes, then we can just branch to the label. 9112 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi)) 9113 .addGlobalAddress(M.getNamedValue(MF.getName())) 9114 .addImm(0)); 9115 return It; 9116 } 9117 9118 // Are we saving the link register? 9119 if (C.CallConstructionID == MachineOutlinerNoLRSave || 9120 C.CallConstructionID == MachineOutlinerThunk) { 9121 // No, so just insert the call. 9122 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL)) 9123 .addGlobalAddress(M.getNamedValue(MF.getName()))); 9124 return It; 9125 } 9126 9127 // We want to return the spot where we inserted the call. 9128 MachineBasicBlock::iterator CallPt; 9129 9130 // Instructions for saving and restoring LR around the call instruction we're 9131 // going to insert. 9132 MachineInstr *Save; 9133 MachineInstr *Restore; 9134 // Can we save to a register? 9135 if (C.CallConstructionID == MachineOutlinerRegSave) { 9136 // FIXME: This logic should be sunk into a target-specific interface so that 9137 // we don't have to recompute the register. 9138 Register Reg = findRegisterToSaveLRTo(C); 9139 assert(Reg && "No callee-saved register available?"); 9140 9141 // LR has to be a live in so that we can save it. 9142 if (!MBB.isLiveIn(AArch64::LR)) 9143 MBB.addLiveIn(AArch64::LR); 9144 9145 // Save and restore LR from Reg. 9146 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg) 9147 .addReg(AArch64::XZR) 9148 .addReg(AArch64::LR) 9149 .addImm(0); 9150 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR) 9151 .addReg(AArch64::XZR) 9152 .addReg(Reg) 9153 .addImm(0); 9154 } else { 9155 // We have the default case. Save and restore from SP. 9156 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) 9157 .addReg(AArch64::SP, RegState::Define) 9158 .addReg(AArch64::LR) 9159 .addReg(AArch64::SP) 9160 .addImm(-16); 9161 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) 9162 .addReg(AArch64::SP, RegState::Define) 9163 .addReg(AArch64::LR, RegState::Define) 9164 .addReg(AArch64::SP) 9165 .addImm(16); 9166 } 9167 9168 It = MBB.insert(It, Save); 9169 It++; 9170 9171 // Insert the call. 
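// Once the call below and the Restore are inserted, the call site reads,
// roughly, either
//   mov xN, x30 ; bl OUTLINED_FUNCTION ; mov x30, xN        (RegSave)
// or
//   str x30, [sp, #-16]! ; bl OUTLINED_FUNCTION ; ldr x30, [sp], #16
// for the default case.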
9172 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL)) 9173 .addGlobalAddress(M.getNamedValue(MF.getName()))); 9174 CallPt = It; 9175 It++; 9176 9177 It = MBB.insert(It, Restore); 9178 return CallPt; 9179 } 9180 9181 bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault( 9182 MachineFunction &MF) const { 9183 return MF.getFunction().hasMinSize(); 9184 } 9185 9186 void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB, 9187 MachineBasicBlock::iterator Iter, 9188 DebugLoc &DL, 9189 bool AllowSideEffects) const { 9190 const MachineFunction &MF = *MBB.getParent(); 9191 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>(); 9192 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo(); 9193 9194 if (TRI.isGeneralPurposeRegister(MF, Reg)) { 9195 BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0); 9196 } else if (STI.hasSVE()) { 9197 BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg) 9198 .addImm(0) 9199 .addImm(0); 9200 } else { 9201 BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg) 9202 .addImm(0); 9203 } 9204 } 9205 9206 std::optional<DestSourcePair> 9207 AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { 9208 9209 // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg 9210 // and zero immediate operands used as an alias for mov instruction. 9211 if (MI.getOpcode() == AArch64::ORRWrs && 9212 MI.getOperand(1).getReg() == AArch64::WZR && 9213 MI.getOperand(3).getImm() == 0x0 && 9214 // Check that the w->w move is not a zero-extending w->x mov. 9215 (!MI.getOperand(0).getReg().isVirtual() || 9216 MI.getOperand(0).getSubReg() == 0) && 9217 (!MI.getOperand(0).getReg().isPhysical() || 9218 MI.findRegisterDefOperandIdx(MI.getOperand(0).getReg() - AArch64::W0 + 9219 AArch64::X0) == -1)) 9220 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)}; 9221 9222 if (MI.getOpcode() == AArch64::ORRXrs && 9223 MI.getOperand(1).getReg() == AArch64::XZR && 9224 MI.getOperand(3).getImm() == 0x0) 9225 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)}; 9226 9227 return std::nullopt; 9228 } 9229 9230 std::optional<DestSourcePair> 9231 AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const { 9232 if (MI.getOpcode() == AArch64::ORRWrs && 9233 MI.getOperand(1).getReg() == AArch64::WZR && 9234 MI.getOperand(3).getImm() == 0x0) 9235 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)}; 9236 return std::nullopt; 9237 } 9238 9239 std::optional<RegImmPair> 9240 AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const { 9241 int Sign = 1; 9242 int64_t Offset = 0; 9243 9244 // TODO: Handle cases where Reg is a super- or sub-register of the 9245 // destination register. 9246 const MachineOperand &Op0 = MI.getOperand(0); 9247 if (!Op0.isReg() || Reg != Op0.getReg()) 9248 return std::nullopt; 9249 9250 switch (MI.getOpcode()) { 9251 default: 9252 return std::nullopt; 9253 case AArch64::SUBWri: 9254 case AArch64::SUBXri: 9255 case AArch64::SUBSWri: 9256 case AArch64::SUBSXri: 9257 Sign *= -1; 9258 [[fallthrough]]; 9259 case AArch64::ADDSWri: 9260 case AArch64::ADDSXri: 9261 case AArch64::ADDWri: 9262 case AArch64::ADDXri: { 9263 // TODO: Third operand can be global address (usually some string). 
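// For example, "add x0, x1, #5, lsl #12" is described as x0 = x1 + (5 << 12),
// and the SUB forms above contribute the offset with a negated sign.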
9264 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() || 9265 !MI.getOperand(2).isImm()) 9266 return std::nullopt; 9267 int Shift = MI.getOperand(3).getImm(); 9268 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12"); 9269 Offset = Sign * (MI.getOperand(2).getImm() << Shift); 9270 } 9271 } 9272 return RegImmPair{MI.getOperand(1).getReg(), Offset}; 9273 } 9274 9275 /// If the given ORR instruction is a copy, and \p DescribedReg overlaps with 9276 /// the destination register then, if possible, describe the value in terms of 9277 /// the source register. 9278 static std::optional<ParamLoadedValue> 9279 describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg, 9280 const TargetInstrInfo *TII, 9281 const TargetRegisterInfo *TRI) { 9282 auto DestSrc = TII->isCopyLikeInstr(MI); 9283 if (!DestSrc) 9284 return std::nullopt; 9285 9286 Register DestReg = DestSrc->Destination->getReg(); 9287 Register SrcReg = DestSrc->Source->getReg(); 9288 9289 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {}); 9290 9291 // If the described register is the destination, just return the source. 9292 if (DestReg == DescribedReg) 9293 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr); 9294 9295 // ORRWrs zero-extends to 64-bits, so we need to consider such cases. 9296 if (MI.getOpcode() == AArch64::ORRWrs && 9297 TRI->isSuperRegister(DestReg, DescribedReg)) 9298 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr); 9299 9300 // We may need to describe the lower part of a ORRXrs move. 9301 if (MI.getOpcode() == AArch64::ORRXrs && 9302 TRI->isSubRegister(DestReg, DescribedReg)) { 9303 Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32); 9304 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr); 9305 } 9306 9307 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) && 9308 "Unhandled ORR[XW]rs copy case"); 9309 9310 return std::nullopt; 9311 } 9312 9313 bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const { 9314 // Functions cannot be split to different sections on AArch64 if they have 9315 // a red zone. This is because relaxing a cross-section branch may require 9316 // incrementing the stack pointer to spill a register, which would overwrite 9317 // the red zone. 9318 if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true)) 9319 return false; 9320 9321 return TargetInstrInfo::isFunctionSafeToSplit(MF); 9322 } 9323 9324 bool AArch64InstrInfo::isMBBSafeToSplitToCold( 9325 const MachineBasicBlock &MBB) const { 9326 // Asm Goto blocks can contain conditional branches to goto labels, which can 9327 // get moved out of range of the branch instruction. 9328 auto isAsmGoto = [](const MachineInstr &MI) { 9329 return MI.getOpcode() == AArch64::INLINEASM_BR; 9330 }; 9331 if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget()) 9332 return false; 9333 9334 // Because jump tables are label-relative instead of table-relative, they all 9335 // must be in the same section or relocation fixup handling will fail. 
9336 9337 // Check if MBB is a jump table target 9338 const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo(); 9339 auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) { 9340 return llvm::is_contained(JTE.MBBs, &MBB); 9341 }; 9342 if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB)) 9343 return false; 9344 9345 // Check if MBB contains a jump table lookup 9346 for (const MachineInstr &MI : MBB) { 9347 switch (MI.getOpcode()) { 9348 case TargetOpcode::G_BRJT: 9349 case AArch64::JumpTableDest32: 9350 case AArch64::JumpTableDest16: 9351 case AArch64::JumpTableDest8: 9352 return false; 9353 default: 9354 continue; 9355 } 9356 } 9357 9358 // MBB isn't a special case, so it's safe to be split to the cold section. 9359 return true; 9360 } 9361 9362 std::optional<ParamLoadedValue> 9363 AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI, 9364 Register Reg) const { 9365 const MachineFunction *MF = MI.getMF(); 9366 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); 9367 switch (MI.getOpcode()) { 9368 case AArch64::MOVZWi: 9369 case AArch64::MOVZXi: { 9370 // MOVZWi may be used for producing zero-extended 32-bit immediates in 9371 // 64-bit parameters, so we need to consider super-registers. 9372 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg)) 9373 return std::nullopt; 9374 9375 if (!MI.getOperand(1).isImm()) 9376 return std::nullopt; 9377 int64_t Immediate = MI.getOperand(1).getImm(); 9378 int Shift = MI.getOperand(2).getImm(); 9379 return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift), 9380 nullptr); 9381 } 9382 case AArch64::ORRWrs: 9383 case AArch64::ORRXrs: 9384 return describeORRLoadedValue(MI, Reg, this, TRI); 9385 } 9386 9387 return TargetInstrInfo::describeLoadedValue(MI, Reg); 9388 } 9389 9390 bool AArch64InstrInfo::isExtendLikelyToBeFolded( 9391 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const { 9392 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT || 9393 ExtMI.getOpcode() == TargetOpcode::G_ZEXT || 9394 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT); 9395 9396 // Anyexts are nops. 9397 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT) 9398 return true; 9399 9400 Register DefReg = ExtMI.getOperand(0).getReg(); 9401 if (!MRI.hasOneNonDBGUse(DefReg)) 9402 return false; 9403 9404 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an 9405 // addressing mode. 9406 auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg); 9407 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD; 9408 } 9409 9410 uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const { 9411 return get(Opc).TSFlags & AArch64::ElementSizeMask; 9412 } 9413 9414 bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const { 9415 return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike; 9416 } 9417 9418 bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const { 9419 return get(Opc).TSFlags & AArch64::InstrFlagIsWhile; 9420 } 9421 9422 unsigned int 9423 AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const { 9424 return OptLevel >= CodeGenOptLevel::Aggressive ? 
6 : 2; 9425 } 9426 9427 bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset, 9428 unsigned Scale) const { 9429 if (Offset && Scale) 9430 return false; 9431 9432 // Check Reg + Imm 9433 if (!Scale) { 9434 // 9-bit signed offset 9435 if (isInt<9>(Offset)) 9436 return true; 9437 9438 // 12-bit unsigned offset 9439 unsigned Shift = Log2_64(NumBytes); 9440 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 && 9441 // Must be a multiple of NumBytes (NumBytes is a power of 2) 9442 (Offset >> Shift) << Shift == Offset) 9443 return true; 9444 return false; 9445 } 9446 9447 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 9448 return Scale == 1 || (Scale > 0 && Scale == NumBytes); 9449 } 9450 9451 unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) { 9452 if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr()) 9453 return AArch64::BLRNoIP; 9454 else 9455 return AArch64::BLR; 9456 } 9457 9458 bool AArch64InstrInfo::isReallyTriviallyReMaterializable( 9459 const MachineInstr &MI) const { 9460 const MachineFunction &MF = *MI.getMF(); 9461 const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>(); 9462 9463 // If the function contains changes to streaming mode, then there 9464 // is a danger that rematerialised instructions end up between 9465 // instruction sequences (e.g. call sequences, or prolog/epilogue) 9466 // where the streaming-SVE mode is temporarily changed. 9467 if (AFI.hasStreamingModeChanges()) { 9468 // Avoid rematerializing rematerializable instructions that use/define 9469 // scalable values, such as 'pfalse' or 'ptrue', which result in different 9470 // results when the runtime vector length is different. 9471 const MachineRegisterInfo &MRI = MF.getRegInfo(); 9472 const MachineFrameInfo &MFI = MF.getFrameInfo(); 9473 if (any_of(MI.operands(), [&MRI, &MFI](const MachineOperand &MO) { 9474 if (MO.isFI() && 9475 MFI.getStackID(MO.getIndex()) == TargetStackID::ScalableVector) 9476 return true; 9477 if (!MO.isReg()) 9478 return false; 9479 9480 if (MO.getReg().isVirtual()) { 9481 const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg()); 9482 return AArch64::ZPRRegClass.hasSubClassEq(RC) || 9483 AArch64::PPRRegClass.hasSubClassEq(RC); 9484 } 9485 return AArch64::ZPRRegClass.contains(MO.getReg()) || 9486 AArch64::PPRRegClass.contains(MO.getReg()); 9487 })) 9488 return false; 9489 9490 // Avoid rematerializing instructions that return a value that is 9491 // different depending on vector length, even when it is not returned 9492 // in a scalable vector/predicate register. 
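// For example, CNTB materializes the vector length in bytes and
// RDVL/ADDVL/ADDPL scale by it, so rematerializing one of these across a
// streaming-mode change could observe a different vector length than the
// original instruction did.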
9493 switch (MI.getOpcode()) {
9494 default:
9495 break;
9496 case AArch64::RDVLI_XI:
9497 case AArch64::ADDVL_XXI:
9498 case AArch64::ADDPL_XXI:
9499 case AArch64::CNTB_XPiI:
9500 case AArch64::CNTH_XPiI:
9501 case AArch64::CNTW_XPiI:
9502 case AArch64::CNTD_XPiI:
9503 return false;
9504 }
9505 }
9506
9507 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
9508 }
9509
9510 MachineBasicBlock::iterator
9511 AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
9512 Register TargetReg, bool FrameSetup) const {
9513 assert(TargetReg != AArch64::SP && "New top of stack cannot already be in SP");
9514
9515 MachineBasicBlock &MBB = *MBBI->getParent();
9516 MachineFunction &MF = *MBB.getParent();
9517 const AArch64InstrInfo *TII =
9518 MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
9519 int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
9520 DebugLoc DL = MBB.findDebugLoc(MBBI);
9521
9522 MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
9523 MachineBasicBlock *LoopTestMBB =
9524 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
9525 MF.insert(MBBInsertPoint, LoopTestMBB);
9526 MachineBasicBlock *LoopBodyMBB =
9527 MF.CreateMachineBasicBlock(MBB.getBasicBlock());
9528 MF.insert(MBBInsertPoint, LoopBodyMBB);
9529 MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
9530 MF.insert(MBBInsertPoint, ExitMBB);
9531 MachineInstr::MIFlag Flags =
9532 FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;
9533
9534 // LoopTest:
9535 // SUB SP, SP, #ProbeSize
9536 emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
9537 AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);
9538
9539 // CMP SP, TargetReg
9540 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
9541 AArch64::XZR)
9542 .addReg(AArch64::SP)
9543 .addReg(TargetReg)
9544 .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
9545 .setMIFlags(Flags);
9546
9547 // B.<Cond> LoopExit
9548 BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
9549 .addImm(AArch64CC::LE)
9550 .addMBB(ExitMBB)
9551 .setMIFlags(Flags);
9552
9553 // STR XZR, [SP]
9554 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
9555 .addReg(AArch64::XZR)
9556 .addReg(AArch64::SP)
9557 .addImm(0)
9558 .setMIFlags(Flags);
9559
9560 // B loop
9561 BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
9562 .addMBB(LoopTestMBB)
9563 .setMIFlags(Flags);
9564
9565 // LoopExit:
9566 // MOV SP, TargetReg
9567 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
9568 .addReg(TargetReg)
9569 .addImm(0)
9570 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
9571 .setMIFlags(Flags);
9572
9573 // LDR XZR, [SP]
9574 BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))
9575 .addReg(AArch64::XZR, RegState::Define)
9576 .addReg(AArch64::SP)
9577 .addImm(0)
9578 .setMIFlags(Flags);
9579
9580 ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
9581 ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
9582
9583 LoopTestMBB->addSuccessor(ExitMBB);
9584 LoopTestMBB->addSuccessor(LoopBodyMBB);
9585 LoopBodyMBB->addSuccessor(LoopTestMBB);
9586 MBB.addSuccessor(LoopTestMBB);
9587
9588 // Update liveins.
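// The loop below recomputes the live-in sets of the three new blocks until
// they stop changing, since updating one block's live-ins can change what is
// live out of its predecessors.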
9589 if (MF.getRegInfo().reservedRegsFrozen()) {
9590 bool anyChange = false;
9591 do {
9592 anyChange = recomputeLiveIns(*ExitMBB) ||
9593 recomputeLiveIns(*LoopBodyMBB) ||
9594 recomputeLiveIns(*LoopTestMBB);
9595 } while (anyChange);
9596
9597 }
9598
9599 return ExitMBB->begin();
9600 }
9601
9602 #define GET_INSTRINFO_HELPERS
9603 #define GET_INSTRMAP_INFO
9604 #include "AArch64GenInstrInfo.inc"
9605