//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>

using namespace llvm;

#define GET_INSTRINFO_CTOR_DTOR
#include "AArch64GenInstrInfo.inc"

static cl::opt<unsigned> TBZDisplacementBits(
    "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
    cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned> CBZDisplacementBits(
    "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
    cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned>
    BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
                        cl::desc("Restrict range of Bcc instructions (DEBUG)"));

AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
    : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
                          AArch64::CATCHRET),
      RI(STI.getTargetTriple()), Subtarget(STI) {}

/// GetInstSize - Return the number of bytes of code the specified
/// instruction may be. This returns the maximum number of bytes.
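/// Most AArch64 instructions encode to a fixed 4 bytes; the variable-size
/// cases handled below are inline assembly, meta-instructions (0 bytes),
/// bundles, and patchable sequences such as STACKMAP/PATCHPOINT/STATEPOINT.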
unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  const MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction *MF = MBB.getParent();
  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();

  {
    auto Op = MI.getOpcode();
    if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
      return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
  }

  // Meta-instructions emit no code.
  if (MI.isMetaInstruction())
    return 0;

  // FIXME: We currently only handle pseudoinstructions that don't get expanded
  // before the assembly printer.
  unsigned NumBytes = 0;
  const MCInstrDesc &Desc = MI.getDesc();

  // The size should preferably be set in
  // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case).
  // The specific cases below handle instructions of variable size.
  switch (Desc.getOpcode()) {
  default:
    if (Desc.getSize())
      return Desc.getSize();

    // Anything not explicitly designated otherwise (i.e. pseudo-instructions
    // with fixed constant size but not specified in .td file) is a normal
    // 4-byte insn.
    NumBytes = 4;
    break;
  case TargetOpcode::STACKMAP:
    // The upper bound for a stackmap intrinsic is the full length of its
    // shadow.
    NumBytes = StackMapOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::PATCHPOINT:
    // The size of the patchpoint intrinsic is the number of bytes requested.
    NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::STATEPOINT:
    NumBytes = StatepointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    // No patch bytes means a normal call inst is emitted.
    if (NumBytes == 0)
      NumBytes = 4;
    break;
  case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
  case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
    // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
    NumBytes = 36;
    break;

  case AArch64::SPACE:
    NumBytes = MI.getOperand(1).getImm();
    break;
  case TargetOpcode::BUNDLE:
    NumBytes = getInstBundleLength(MI);
    break;
  }

  return NumBytes;
}

unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
  unsigned Size = 0;
  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
  while (++I != E && I->isInsideBundle()) {
    assert(!I->isBundle() && "No nested bundle!");
    Size += getInstSizeInBytes(*I);
  }
  return Size;
}

static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
                            SmallVectorImpl<MachineOperand> &Cond) {
  // Block ends with fall-through condbranch.
  switch (LastInst->getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    Target = LastInst->getOperand(2).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    Cond.push_back(LastInst->getOperand(1));
  }
}

static unsigned getBranchDisplacementBits(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return 64;
  case AArch64::TBNZW:
  case AArch64::TBZW:
  case AArch64::TBNZX:
  case AArch64::TBZX:
    return TBZDisplacementBits;
  case AArch64::CBNZW:
  case AArch64::CBZW:
  case AArch64::CBNZX:
  case AArch64::CBZX:
    return CBZDisplacementBits;
  case AArch64::Bcc:
    return BCCDisplacementBits;
  }
}

bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                             int64_t BrOffset) const {
  unsigned Bits = getBranchDisplacementBits(BranchOp);
  assert(Bits >= 3 && "max branch displacement must be enough to jump "
                      "over conditional branch expansion");
  return isIntN(Bits, BrOffset / 4);
}

MachineBasicBlock *
AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return MI.getOperand(0).getMBB();
  case AArch64::TBZW:
  case AArch64::TBNZW:
  case AArch64::TBZX:
  case AArch64::TBNZX:
    return MI.getOperand(2).getMBB();
  case AArch64::CBZW:
  case AArch64::CBNZW:
  case AArch64::CBZX:
  case AArch64::CBNZX:
  case AArch64::Bcc:
    return MI.getOperand(1).getMBB();
  }
}

// Branch analysis.
bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *&TBB,
                                     MachineBasicBlock *&FBB,
                                     SmallVectorImpl<MachineOperand> &Cond,
                                     bool AllowModify) const {
  // If the block has no terminators, it just falls into the block after it.
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return false;

  // Skip over SpeculationBarrierEndBB terminators.
  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
    --I;
  }

  if (!isUnpredicatedTerminator(*I))
    return false;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;

  // If there is only one terminator instruction, process it.
  unsigned LastOpc = LastInst->getOpcode();
  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
    if (isUncondBranchOpcode(LastOpc)) {
      TBB = LastInst->getOperand(0).getMBB();
      return false;
    }
    if (isCondBranchOpcode(LastOpc)) {
      // Block ends with fall-through condbranch.
      parseCondBranch(LastInst, TBB, Cond);
      return false;
    }
    return true; // Can't handle indirect branch.
  }

  // Get the instruction before it if it is a terminator.
  MachineInstr *SecondLastInst = &*I;
  unsigned SecondLastOpc = SecondLastInst->getOpcode();

  // If AllowModify is true and the block ends with two or more unconditional
  // branches, delete all but the first unconditional branch.
  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
    while (isUncondBranchOpcode(SecondLastOpc)) {
      LastInst->eraseFromParent();
      LastInst = SecondLastInst;
      LastOpc = LastInst->getOpcode();
      if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
        // Return now; the only terminator is an unconditional branch.
        TBB = LastInst->getOperand(0).getMBB();
        return false;
      } else {
        SecondLastInst = &*I;
        SecondLastOpc = SecondLastInst->getOpcode();
      }
    }
  }

  // If we're allowed to modify and the block ends in an unconditional branch
  // which could simply fall through, remove the branch. (Note: This case only
  // matters when we can't understand the whole sequence, otherwise it's also
  // handled by BranchFolding.cpp.)
  if (AllowModify && isUncondBranchOpcode(LastOpc) &&
      MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
    LastInst->eraseFromParent();
    LastInst = SecondLastInst;
    LastOpc = LastInst->getOpcode();
    if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
      assert(!isUncondBranchOpcode(LastOpc) &&
             "unreachable unconditional branches removed above");

      if (isCondBranchOpcode(LastOpc)) {
        // Block ends with fall-through condbranch.
        parseCondBranch(LastInst, TBB, Cond);
        return false;
      }
      return true; // Can't handle indirect branch.
    } else {
      SecondLastInst = &*I;
      SecondLastOpc = SecondLastInst->getOpcode();
    }
  }

  // If there are three terminators, we don't know what sort of block this is.
  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
    return true;

  // If the block ends with a B and a Bcc, handle it.
  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    parseCondBranch(SecondLastInst, TBB, Cond);
    FBB = LastInst->getOperand(0).getMBB();
    return false;
  }

  // If the block ends with two unconditional branches, handle it. The second
  // one is not executed, so remove it.
  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    TBB = SecondLastInst->getOperand(0).getMBB();
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return false;
  }

  // ...likewise if it ends with an indirect branch followed by an unconditional
  // branch.
  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return true;
  }

  // Otherwise, can't handle this.
  return true;
}

bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
                                              MachineBranchPredicate &MBP,
                                              bool AllowModify) const {
  // For the moment, handle only a block which ends with a cb(n)zx followed by
  // a fallthrough. Why this? Because it is a common form.
  // TODO: Should we handle b.cc?
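  // For example, the matched shape is (virtual register names are made up):
  //   %0:gpr64 = ...
  //   CBNZX %0, %bb.true
  //   (fall through to the next block)
  // which is reported as LHS = %0, RHS = #0, Predicate = PRED_NE.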

  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return true;

  // Skip over SpeculationBarrierEndBB terminators.
  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
    --I;
  }

  if (!isUnpredicatedTerminator(*I))
    return true;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;
  unsigned LastOpc = LastInst->getOpcode();
  if (!isCondBranchOpcode(LastOpc))
    return true;

  switch (LastOpc) {
  default:
    return true;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    break;
  }

  MBP.TrueDest = LastInst->getOperand(1).getMBB();
  assert(MBP.TrueDest && "expected!");
  MBP.FalseDest = MBB.getNextNode();

  MBP.ConditionDef = nullptr;
  MBP.SingleUseCondition = false;

  MBP.LHS = LastInst->getOperand(0);
  MBP.RHS = MachineOperand::CreateImm(0);
  MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
                                            : MachineBranchPredicate::PRED_EQ;
  return false;
}

bool AArch64InstrInfo::reverseBranchCondition(
    SmallVectorImpl<MachineOperand> &Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
    Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
  } else {
    // Folded compare-and-branch
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown conditional branch!");
    case AArch64::CBZW:
      Cond[1].setImm(AArch64::CBNZW);
      break;
    case AArch64::CBNZW:
      Cond[1].setImm(AArch64::CBZW);
      break;
    case AArch64::CBZX:
      Cond[1].setImm(AArch64::CBNZX);
      break;
    case AArch64::CBNZX:
      Cond[1].setImm(AArch64::CBZX);
      break;
    case AArch64::TBZW:
      Cond[1].setImm(AArch64::TBNZW);
      break;
    case AArch64::TBNZW:
      Cond[1].setImm(AArch64::TBZW);
      break;
    case AArch64::TBZX:
      Cond[1].setImm(AArch64::TBNZX);
      break;
    case AArch64::TBNZX:
      Cond[1].setImm(AArch64::TBZX);
      break;
    }
  }

  return false;
}

unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
                                        int *BytesRemoved) const {
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return 0;

  if (!isUncondBranchOpcode(I->getOpcode()) &&
      !isCondBranchOpcode(I->getOpcode()))
    return 0;

  // Remove the branch.
  I->eraseFromParent();

  I = MBB.end();

  if (I == MBB.begin()) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }
  --I;
  if (!isCondBranchOpcode(I->getOpcode())) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }

  // Remove the branch.
  I->eraseFromParent();
  if (BytesRemoved)
    *BytesRemoved = 8;

  return 2;
}

void AArch64InstrInfo::instantiateCondBranch(
    MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
    ArrayRef<MachineOperand> Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
  } else {
    // Folded compare-and-branch
    // Note that we use addOperand instead of addReg to keep the flags.
    const MachineInstrBuilder MIB =
        BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
    if (Cond.size() > 3)
      MIB.addImm(Cond[3].getImm());
    MIB.addMBB(TBB);
  }
}

unsigned AArch64InstrInfo::insertBranch(
    MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
    ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
  // Shouldn't be a fall through.
  assert(TBB && "insertBranch must not be told to insert a fallthrough");

  if (!FBB) {
    if (Cond.empty()) // Unconditional branch?
      BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
    else
      instantiateCondBranch(MBB, DL, TBB, Cond);

    if (BytesAdded)
      *BytesAdded = 4;

    return 1;
  }

  // Two-way conditional branch.
  instantiateCondBranch(MBB, DL, TBB, Cond);
  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);

  if (BytesAdded)
    *BytesAdded = 8;

  return 2;
}

// Find the original register that VReg is copied from.
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
  while (Register::isVirtualRegister(VReg)) {
    const MachineInstr *DefMI = MRI.getVRegDef(VReg);
    if (!DefMI->isFullCopy())
      return VReg;
    VReg = DefMI->getOperand(1).getReg();
  }
  return VReg;
}

// Determine if VReg is defined by an instruction that can be folded into a
// csel instruction. If so, return the folded opcode, and the replacement
// register.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
                                unsigned *NewVReg = nullptr) {
  VReg = removeCopies(MRI, VReg);
  if (!Register::isVirtualRegister(VReg))
    return 0;

  bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
  unsigned Opc = 0;
  unsigned SrcOpNum = 0;
  switch (DefMI->getOpcode()) {
  case AArch64::ADDSXri:
  case AArch64::ADDSWri:
    // if NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
      return 0;
    // fall-through to ADDXri and ADDWri.
    [[fallthrough]];
  case AArch64::ADDXri:
  case AArch64::ADDWri:
    // add x, 1 -> csinc.
    if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
        DefMI->getOperand(3).getImm() != 0)
      return 0;
    SrcOpNum = 1;
    Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
    break;

  case AArch64::ORNXrr:
  case AArch64::ORNWrr: {
    // not x -> csinv, represented as orn dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcOpNum = 2;
    Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
    break;
  }

  case AArch64::SUBSXrr:
  case AArch64::SUBSWrr:
    // if NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
      return 0;
    // fall-through to SUBXrr and SUBWrr.
    [[fallthrough]];
  case AArch64::SUBXrr:
  case AArch64::SUBWrr: {
    // neg x -> csneg, represented as sub dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcOpNum = 2;
    Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
    break;
  }
  default:
    return 0;
  }
  assert(Opc && SrcOpNum && "Missing parameters");

  if (NewVReg)
    *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
  return Opc;
}

bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
                                       ArrayRef<MachineOperand> Cond,
                                       Register DstReg, Register TrueReg,
                                       Register FalseReg, int &CondCycles,
                                       int &TrueCycles,
                                       int &FalseCycles) const {
  // Check register classes.
  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *RC =
      RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
  if (!RC)
    return false;

  // Also need to check the dest regclass, in case we're trying to optimize
  // something like:
  // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
  if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
    return false;

  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
  unsigned ExtraCondLat = Cond.size() != 1;

  // GPRs are handled by csel.
  // FIXME: Fold in x+1, -x, and ~x when applicable.
  if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
      AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
    // Single-cycle csel, csinc, csinv, and csneg.
    CondCycles = 1 + ExtraCondLat;
    TrueCycles = FalseCycles = 1;
    if (canFoldIntoCSel(MRI, TrueReg))
      TrueCycles = 0;
    else if (canFoldIntoCSel(MRI, FalseReg))
      FalseCycles = 0;
    return true;
  }

  // Scalar floating point is handled by fcsel.
  // FIXME: Form fabs, fmin, and fmax when applicable.
  if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
      AArch64::FPR32RegClass.hasSubClassEq(RC)) {
    CondCycles = 5 + ExtraCondLat;
    TrueCycles = FalseCycles = 2;
    return true;
  }

  // Can't do vectors.
  return false;
}

void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const DebugLoc &DL, Register DstReg,
                                    ArrayRef<MachineOperand> Cond,
                                    Register TrueReg, Register FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Parse the condition code, see parseCondBranch() above.
  AArch64CC::CondCode CC;
  switch (Cond.size()) {
  default:
    llvm_unreachable("Unknown condition opcode in Cond");
  case 1: // b.cc
    CC = AArch64CC::CondCode(Cond[0].getImm());
    break;
  case 3: { // cbz/cbnz
    // We must insert a compare against 0.
    bool Is64Bit;
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::CBZW:
      Is64Bit = false;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBZX:
      Is64Bit = true;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBNZW:
      Is64Bit = false;
      CC = AArch64CC::NE;
      break;
    case AArch64::CBNZX:
      Is64Bit = true;
      CC = AArch64CC::NE;
      break;
    }
    Register SrcReg = Cond[2].getReg();
    if (Is64Bit) {
      // cmp reg, #0 is actually subs xzr, reg, #0.
      MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    } else {
      MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    }
    break;
  }
  case 4: { // tbz/tbnz
    // We must insert a tst instruction.
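    // e.g. for a TBZW of bit 3 this becomes 'ANDSWri wzr, <reg>, #(1 << 3)',
    // with the mask encoded as a logical immediate below.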
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::TBZW:
    case AArch64::TBZX:
      CC = AArch64CC::EQ;
      break;
    case AArch64::TBNZW:
    case AArch64::TBNZX:
      CC = AArch64CC::NE;
      break;
    }
    // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
      BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
    else
      BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
    break;
  }
  }

  unsigned Opc = 0;
  const TargetRegisterClass *RC = nullptr;
  bool TryFold = false;
  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
    RC = &AArch64::GPR64RegClass;
    Opc = AArch64::CSELXr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
    RC = &AArch64::GPR32RegClass;
    Opc = AArch64::CSELWr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FCSELDrrr;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
    RC = &AArch64::FPR32RegClass;
    Opc = AArch64::FCSELSrrr;
  }
  assert(RC && "Unsupported regclass");

  // Try folding simple instructions into the csel.
  if (TryFold) {
    unsigned NewVReg = 0;
    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
    if (FoldedOpc) {
      // The folded opcodes csinc, csinv and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
      CC = AArch64CC::getInvertedCondCode(CC);
      TrueReg = FalseReg;
    } else
      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);

    // Fold the operation. Leave any dead instructions for DCE to clean up.
    if (FoldedOpc) {
      FalseReg = NewVReg;
      Opc = FoldedOpc;
      // This extends the live range of NewVReg.
      MRI.clearKillFlags(NewVReg);
    }
  }

  // Pull all virtual registers into the appropriate class.
  MRI.constrainRegClass(TrueReg, RC);
  MRI.constrainRegClass(FalseReg, RC);

  // Insert the csel.
  BuildMI(MBB, I, DL, get(Opc), DstReg)
      .addReg(TrueReg)
      .addReg(FalseReg)
      .addImm(CC);
}

/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
  uint64_t Imm = MI.getOperand(1).getImm();
  uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
  uint64_t Encoding;
  return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
}

// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
  if (!Subtarget.hasCustomCheapAsMoveHandling())
    return MI.isAsCheapAsAMove();

  const unsigned Opcode = MI.getOpcode();

  // Firstly, check cases gated by features.

  if (Subtarget.hasZeroCycleZeroingFP()) {
    if (Opcode == AArch64::FMOVH0 ||
        Opcode == AArch64::FMOVS0 ||
        Opcode == AArch64::FMOVD0)
      return true;
  }

  if (Subtarget.hasZeroCycleZeroingGP()) {
    if (Opcode == TargetOpcode::COPY &&
        (MI.getOperand(1).getReg() == AArch64::WZR ||
         MI.getOperand(1).getReg() == AArch64::XZR))
      return true;
  }

  // Secondly, check cases specific to sub-targets.

  if (Subtarget.hasExynosCheapAsMoveHandling()) {
    if (isExynosCheapAsMove(MI))
      return true;

    return MI.isAsCheapAsAMove();
  }

  // Finally, check generic cases.

  switch (Opcode) {
  default:
    return false;

  // add/sub on register without shift
  case AArch64::ADDWri:
  case AArch64::ADDXri:
  case AArch64::SUBWri:
  case AArch64::SUBXri:
    return (MI.getOperand(3).getImm() == 0);

  // logical ops on immediate
  case AArch64::ANDWri:
  case AArch64::ANDXri:
  case AArch64::EORWri:
  case AArch64::EORXri:
  case AArch64::ORRWri:
  case AArch64::ORRXri:
    return true;

  // logical ops on register without shift
  case AArch64::ANDWrr:
  case AArch64::ANDXrr:
  case AArch64::BICWrr:
  case AArch64::BICXrr:
  case AArch64::EONWrr:
  case AArch64::EONXrr:
  case AArch64::EORWrr:
  case AArch64::EORXrr:
  case AArch64::ORNWrr:
  case AArch64::ORNXrr:
  case AArch64::ORRWrr:
  case AArch64::ORRXrr:
    return true;

  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
  // ORRXri, it is as cheap as MOV.
  case AArch64::MOVi32imm:
    return canBeExpandedToORR(MI, 32);
  case AArch64::MOVi64imm:
    return canBeExpandedToORR(MI, 64);
  }

  llvm_unreachable("Unknown opcode to check as cheap as a move!");
}

bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::ADDSWrs:
  case AArch64::ADDSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    if (ShiftVal == 0)
      return true;
    return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
  }

  case AArch64::ADDWrx:
  case AArch64::ADDXrx:
  case AArch64::ADDXrx64:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrx:
  case AArch64::ADDSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) <= 4;
    }
  }

  case AArch64::SUBWrs:
  case AArch64::SUBSWrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
  }

  case AArch64::SUBXrs:
  case AArch64::SUBSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
  }

  case AArch64::SUBWrx:
  case AArch64::SUBXrx:
  case AArch64::SUBXrx64:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrx:
  case AArch64::SUBSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) == 0;
    }
  }

  case AArch64::LDRBBroW:
  case AArch64::LDRBBroX:
  case AArch64::LDRBroW:
  case AArch64::LDRBroX:
  case AArch64::LDRDroW:
  case AArch64::LDRDroX:
  case AArch64::LDRHHroW:
  case AArch64::LDRHHroX:
  case AArch64::LDRHroW:
  case AArch64::LDRHroX:
  case AArch64::LDRQroW:
  case AArch64::LDRQroX:
  case AArch64::LDRSBWroW:
  case AArch64::LDRSBWroX:
  case AArch64::LDRSBXroW:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSHWroW:
  case AArch64::LDRSHWroX:
  case AArch64::LDRSHXroW:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSWroW:
  case AArch64::LDRSWroX:
  case AArch64::LDRSroW:
  case AArch64::LDRSroX:
  case AArch64::LDRWroW:
  case AArch64::LDRWroX:
  case AArch64::LDRXroW:
  case AArch64::LDRXroX:
  case AArch64::PRFMroW:
  case AArch64::PRFMroX:
  case AArch64::STRBBroW:
  case AArch64::STRBBroX:
  case AArch64::STRBroW:
  case AArch64::STRBroX:
  case AArch64::STRDroW:
  case AArch64::STRDroX:
  case AArch64::STRHHroW:
  case AArch64::STRHHroX:
  case AArch64::STRHroW:
  case AArch64::STRHroX:
  case AArch64::STRQroW:
  case AArch64::STRQroX:
  case AArch64::STRSroW:
  case AArch64::STRSroX:
  case AArch64::STRWroW:
  case AArch64::STRWroX:
  case AArch64::STRXroW:
  case AArch64::STRXroX: {
    unsigned IsSigned = MI.getOperand(3).getImm();
    return !IsSigned;
  }
  }
}

bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  case AArch64::SEH_StackAlloc:
  case AArch64::SEH_SaveFPLR:
  case AArch64::SEH_SaveFPLR_X:
  case AArch64::SEH_SaveReg:
  case AArch64::SEH_SaveReg_X:
  case AArch64::SEH_SaveRegP:
  case AArch64::SEH_SaveRegP_X:
  case AArch64::SEH_SaveFReg:
  case AArch64::SEH_SaveFReg_X:
  case AArch64::SEH_SaveFRegP:
  case AArch64::SEH_SaveFRegP_X:
  case AArch64::SEH_SetFP:
  case AArch64::SEH_AddFP:
  case AArch64::SEH_Nop:
  case AArch64::SEH_PrologEnd:
  case AArch64::SEH_EpilogStart:
  case AArch64::SEH_EpilogEnd:
  case AArch64::SEH_PACSignLR:
    return true;
  }
}

bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
                                             Register &SrcReg, Register &DstReg,
                                             unsigned &SubIdx) const {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::SBFMXri: // aka sxtw
  case AArch64::UBFMXri: // aka uxtw
    // Check for the 32 -> 64 bit extension case, these instructions can do
    // much more.
    if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
      return false;
    // This is a signed or unsigned 32 -> 64 bit extension.
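    // Report the extension as a copy of the 32-bit source into the low half
    // (sub_32) of the 64-bit destination so callers can coalesce it away.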
    SrcReg = MI.getOperand(1).getReg();
    DstReg = MI.getOperand(0).getReg();
    SubIdx = AArch64::sub_32;
    return true;
  }
}

bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
    const MachineInstr &MIa, const MachineInstr &MIb) const {
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
  int64_t OffsetA = 0, OffsetB = 0;
  unsigned WidthA = 0, WidthB = 0;
  bool OffsetAIsScalable = false, OffsetBIsScalable = false;

  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");

  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
      MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    return false;

  // Retrieve the base, the offset from the base, and the width. Width
  // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
  // the bases are identical, and the offset of the lower memory access plus
  // its width does not reach the offset of the higher memory access,
  // then the memory accesses are disjoint.
  // If OffsetAIsScalable and OffsetBIsScalable are both true, they
  // are assumed to have the same scale (vscale).
  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
                                   WidthA, TRI) &&
      getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
                                   WidthB, TRI)) {
    if (BaseOpA->isIdenticalTo(*BaseOpB) &&
        OffsetAIsScalable == OffsetBIsScalable) {
      int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
      int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
      int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
      if (LowOffset + LowWidth <= HighOffset)
        return true;
    }
  }
  return false;
}

bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                            const MachineBasicBlock *MBB,
                                            const MachineFunction &MF) const {
  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
    return true;
  switch (MI.getOpcode()) {
  case AArch64::HINT:
    // CSDB hints are scheduling barriers.
    if (MI.getOperand(0).getImm() == 0x14)
      return true;
    break;
  case AArch64::DSB:
  case AArch64::ISB:
    // DSB and ISB are also scheduling barriers.
    return true;
  case AArch64::MSRpstatesvcrImm1:
    // SMSTART and SMSTOP are also scheduling barriers.
    return true;
  default:;
  }
  if (isSEHInstruction(MI))
    return true;
  auto Next = std::next(MI.getIterator());
  return Next != MBB->end() && Next->isCFIInstruction();
}

/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
                                      Register &SrcReg2, int64_t &CmpMask,
                                      int64_t &CmpValue) const {
  // The first operand can be a frame index where we'd normally expect a
  // register.
  assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
  if (!MI.getOperand(1).isReg())
    return false;

  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::PTEST_PP:
  case AArch64::PTEST_PP_ANY:
    SrcReg = MI.getOperand(0).getReg();
    SrcReg2 = MI.getOperand(1).getReg();
    // Not sure about the mask and value for now...
    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWrr:
  case AArch64::SUBSWrs:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXrs:
  case AArch64::SUBSXrx:
  case AArch64::ADDSWrr:
  case AArch64::ADDSWrs:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXrs:
  case AArch64::ADDSXrx:
    // Replace SUBSWrr with SUBWrr if NZCV is not used.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = MI.getOperand(2).getReg();
    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWri:
  case AArch64::ADDSWri:
  case AArch64::SUBSXri:
  case AArch64::ADDSXri:
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    CmpValue = MI.getOperand(2).getImm();
    return true;
  case AArch64::ANDSWri:
  case AArch64::ANDSXri:
    // ANDS does not use the same encoding scheme as the other xxxS
    // instructions.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    CmpValue = AArch64_AM::decodeLogicalImmediate(
        MI.getOperand(2).getImm(),
        MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
    return true;
  }

  return false;
}

static bool UpdateOperandRegClass(MachineInstr &Instr) {
  MachineBasicBlock *MBB = Instr.getParent();
  assert(MBB && "Can't get MachineBasicBlock here");
  MachineFunction *MF = MBB->getParent();
  assert(MF && "Can't get MachineFunction here");
  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  MachineRegisterInfo *MRI = &MF->getRegInfo();

  for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
       ++OpIdx) {
    MachineOperand &MO = Instr.getOperand(OpIdx);
    const TargetRegisterClass *OpRegCstraints =
        Instr.getRegClassConstraint(OpIdx, TII, TRI);

    // If there's no constraint, there's nothing to do.
    if (!OpRegCstraints)
      continue;
    // If the operand is a frame index, there's nothing to do here.
    // A frame index operand will resolve correctly during PEI.
    if (MO.isFI())
      continue;

    assert(MO.isReg() &&
           "Operand has register constraints without being a register!");

    Register Reg = MO.getReg();
    if (Reg.isPhysical()) {
      if (!OpRegCstraints->contains(Reg))
        return false;
    } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
               !MRI->constrainRegClass(Reg, OpRegCstraints))
      return false;
  }

  return true;
}

/// Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible for doing the actual
/// substitution and legality checking.
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
  // Don't convert all compare instructions, because for some the zero register
  // encoding becomes the sp register.
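  // e.g. for 'SUBSWri $wzr, $w1, 0' (a plain cmp), dropping the S suffix would
  // re-encode the WZR destination as WSP, so such cases keep the S opcode.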
  bool MIDefinesZeroReg = false;
  if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
    MIDefinesZeroReg = true;

  switch (MI.getOpcode()) {
  default:
    return MI.getOpcode();
  case AArch64::ADDSWrr:
    return AArch64::ADDWrr;
  case AArch64::ADDSWri:
    return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
  case AArch64::ADDSWrs:
    return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
  case AArch64::ADDSWrx:
    return AArch64::ADDWrx;
  case AArch64::ADDSXrr:
    return AArch64::ADDXrr;
  case AArch64::ADDSXri:
    return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
  case AArch64::ADDSXrs:
    return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
  case AArch64::ADDSXrx:
    return AArch64::ADDXrx;
  case AArch64::SUBSWrr:
    return AArch64::SUBWrr;
  case AArch64::SUBSWri:
    return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
  case AArch64::SUBSWrs:
    return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
  case AArch64::SUBSWrx:
    return AArch64::SUBWrx;
  case AArch64::SUBSXrr:
    return AArch64::SUBXrr;
  case AArch64::SUBSXri:
    return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
  case AArch64::SUBSXrs:
    return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
  case AArch64::SUBSXrx:
    return AArch64::SUBXrx;
  }
}

enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };

/// True when condition flags are accessed (either by writing or reading)
/// on the instruction trace starting at From and ending at To.
///
/// Note: If From and To are from different blocks it's assumed the flags are
/// accessed on the path.
static bool areCFlagsAccessedBetweenInstrs(
    MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
    const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
  // Early exit if To is at the beginning of the BB.
  if (To == To->getParent()->begin())
    return true;

  // Check whether the instructions are in the same basic block.
  // If not, assume the condition flags might get modified somewhere.
  if (To->getParent() != From->getParent())
    return true;

  // From must be above To.
  assert(std::any_of(
      ++To.getReverse(), To->getParent()->rend(),
      [From](MachineInstr &MI) { return MI.getIterator() == From; }));

  // We iterate backward starting at \p To until we hit \p From.
  for (const MachineInstr &Instr :
       instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
    if (((AccessToCheck & AK_Write) &&
         Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
        ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
      return true;
  }
  return false;
}

/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
/// operation which could set the flags in an identical manner.
bool AArch64InstrInfo::optimizePTestInstr(
    MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
    const MachineRegisterInfo *MRI) const {
  auto *Mask = MRI->getUniqueVRegDef(MaskReg);
  auto *Pred = MRI->getUniqueVRegDef(PredReg);
  auto NewOp = Pred->getOpcode();
  bool OpChanged = false;

  unsigned MaskOpcode = Mask->getOpcode();
  unsigned PredOpcode = Pred->getOpcode();
  bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
  bool PredIsWhileLike = isWhileOpcode(PredOpcode);

  if (isPTrueOpcode(MaskOpcode) && (PredIsPTestLike || PredIsWhileLike) &&
      getElementSizeForOpcode(MaskOpcode) ==
          getElementSizeForOpcode(PredOpcode) &&
      Mask->getOperand(1).getImm() == 31) {
    // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
    // redundant since WHILE performs an implicit PTEST with an all active
    // mask. The mask must be an all active predicate of matching element size.

    // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
    // PTEST_LIKE instruction uses the same all active mask and the element
    // size matches. If the PTEST has a condition of any then it is always
    // redundant.
    if (PredIsPTestLike) {
      auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
      if (Mask != PTestLikeMask && PTest->getOpcode() != AArch64::PTEST_PP_ANY)
        return false;
    }

    // Fall through to simply remove the PTEST.
  } else if ((Mask == Pred) && (PredIsPTestLike || PredIsWhileLike) &&
             PTest->getOpcode() == AArch64::PTEST_PP_ANY) {
    // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
    // instruction that sets the flags as PTEST would. This is only valid when
    // the condition is any.

    // Fall through to simply remove the PTEST.
  } else if (PredIsPTestLike) {
    // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
    // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
    // on 8-bit predicates like the PTEST. Otherwise, for instructions like
    // compare that also support 16/32/64-bit predicates, the implicit PTEST
    // performed by the compare could consider fewer lanes for these element
    // sizes.
    //
    // For example, consider
    //
    //   ptrue p0.b                    ; P0=1111-1111-1111-1111
    //   index z0.s, #0, #1            ; Z0=<0,1,2,3>
    //   index z1.s, #1, #1            ; Z1=<1,2,3,4>
    //   cmphi p1.s, p0/z, z1.s, z0.s  ; P1=0001-0001-0001-0001
    //                                 ;       ^ last active
    //   ptest p0, p1.b                ; P1=0001-0001-0001-0001
    //                                 ;     ^ last active
    //
    // where the compare generates a canonical all active 32-bit predicate
    // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
    // active flag, whereas the PTEST instruction with the same mask doesn't.
    // For PTEST_ANY this doesn't apply as the flags in this case would be
    // identical regardless of element size.
    auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
    uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
    if ((Mask != PTestLikeMask) ||
        (PredElementSize != AArch64::ElementSizeB &&
         PTest->getOpcode() != AArch64::PTEST_PP_ANY))
      return false;

    // Fall through to simply remove the PTEST.
  } else {
    // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
    // opcode so the PTEST becomes redundant.
    switch (PredOpcode) {
    case AArch64::AND_PPzPP:
    case AArch64::BIC_PPzPP:
    case AArch64::EOR_PPzPP:
    case AArch64::NAND_PPzPP:
    case AArch64::NOR_PPzPP:
    case AArch64::ORN_PPzPP:
    case AArch64::ORR_PPzPP:
    case AArch64::BRKA_PPzP:
    case AArch64::BRKPA_PPzPP:
    case AArch64::BRKB_PPzP:
    case AArch64::BRKPB_PPzPP:
    case AArch64::RDFFR_PPz: {
      // Check to see if our mask is the same. If not the resulting flag bits
      // may be different and we can't remove the ptest.
      auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
      if (Mask != PredMask)
        return false;
      break;
    }
    case AArch64::BRKN_PPzP: {
      // BRKN uses an all active implicit mask to set flags unlike the other
      // flag-setting instructions.
      // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
      if ((MaskOpcode != AArch64::PTRUE_B) ||
          (Mask->getOperand(1).getImm() != 31))
        return false;
      break;
    }
    case AArch64::PTRUE_B:
      // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
      break;
    default:
      // Bail out if we don't recognize the input.
      return false;
    }

    NewOp = convertToFlagSettingOpc(PredOpcode);
    OpChanged = true;
  }

  const TargetRegisterInfo *TRI = &getRegisterInfo();

  // If another instruction between Pred and PTest accesses flags, don't remove
  // the ptest or update the earlier instruction to modify them.
  if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
    return false;

  // If we pass all the checks, it's safe to remove the PTEST and use the flags
  // as they are prior to PTEST. Sometimes this requires the tested PTEST
  // operand to be replaced with an equivalent instruction that also sets the
  // flags.
  Pred->setDesc(get(NewOp));
  PTest->eraseFromParent();
  if (OpChanged) {
    bool succeeded = UpdateOperandRegClass(*Pred);
    (void)succeeded;
    assert(succeeded && "Operands have incompatible register classes!");
    Pred->addRegisterDefined(AArch64::NZCV, TRI);
  }

  // Ensure that the flags def is live.
  if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
    unsigned i = 0, e = Pred->getNumOperands();
    for (; i != e; ++i) {
      MachineOperand &MO = Pred->getOperand(i);
      if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
        MO.setIsDead(false);
        break;
      }
    }
  }
  return true;
}

/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It is a true compare instruction
/// when there are no uses of its destination register.
///
/// The following steps are tried in order:
/// 1. Convert CmpInstr into an unconditional version.
/// 2. Remove CmpInstr if above there is an instruction producing a needed
///    condition code or an instruction which can be converted into such an
///    instruction.
///    Only comparison with zero is supported.
bool AArch64InstrInfo::optimizeCompareInstr(
    MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
    int64_t CmpValue, const MachineRegisterInfo *MRI) const {
  assert(CmpInstr.getParent());
  assert(MRI);

  // Replace SUBSWrr with SUBWrr if NZCV is not used.
  int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
  if (DeadNZCVIdx != -1) {
    if (CmpInstr.definesRegister(AArch64::WZR) ||
        CmpInstr.definesRegister(AArch64::XZR)) {
      CmpInstr.eraseFromParent();
      return true;
    }
    unsigned Opc = CmpInstr.getOpcode();
    unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
    if (NewOpc == Opc)
      return false;
    const MCInstrDesc &MCID = get(NewOpc);
    CmpInstr.setDesc(MCID);
    CmpInstr.removeOperand(DeadNZCVIdx);
    bool succeeded = UpdateOperandRegClass(CmpInstr);
    (void)succeeded;
    assert(succeeded && "Some operands reg class are incompatible!");
    return true;
  }

  if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
      CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY)
    return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);

  if (SrcReg2 != 0)
    return false;

  // CmpInstr is a Compare instruction if destination register is not used.
  if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
    return false;

  if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
    return true;
  return (CmpValue == 0 || CmpValue == 1) &&
         removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
}

/// Get the opcode of the S version of Instr.
/// If Instr is already the S version, its opcode is returned.
/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
/// version or we are not interested in it.
static unsigned sForm(MachineInstr &Instr) {
  switch (Instr.getOpcode()) {
  default:
    return AArch64::INSTRUCTION_LIST_END;

  case AArch64::ADDSWrr:
  case AArch64::ADDSWri:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXri:
  case AArch64::SUBSWrr:
  case AArch64::SUBSWri:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXri:
    return Instr.getOpcode();

  case AArch64::ADDWrr:
    return AArch64::ADDSWrr;
  case AArch64::ADDWri:
    return AArch64::ADDSWri;
  case AArch64::ADDXrr:
    return AArch64::ADDSXrr;
  case AArch64::ADDXri:
    return AArch64::ADDSXri;
  case AArch64::ADCWr:
    return AArch64::ADCSWr;
  case AArch64::ADCXr:
    return AArch64::ADCSXr;
  case AArch64::SUBWrr:
    return AArch64::SUBSWrr;
  case AArch64::SUBWri:
    return AArch64::SUBSWri;
  case AArch64::SUBXrr:
    return AArch64::SUBSXrr;
  case AArch64::SUBXri:
    return AArch64::SUBSXri;
  case AArch64::SBCWr:
    return AArch64::SBCSWr;
  case AArch64::SBCXr:
    return AArch64::SBCSXr;
  case AArch64::ANDWri:
    return AArch64::ANDSWri;
  case AArch64::ANDXri:
    return AArch64::ANDSXri;
  }
}

/// Check if AArch64::NZCV should be alive in successors of MBB.
static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
  for (auto *BB : MBB->successors())
    if (BB->isLiveIn(AArch64::NZCV))
      return true;
  return false;
}

/// \returns The condition code operand index for \p Instr if it is a branch
/// or select and -1 otherwise.
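/// (The index is computed relative to the implicit NZCV use: two operands
/// before it for Bcc, one before it for the csel/fcsel family.)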
static int
findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
  switch (Instr.getOpcode()) {
  default:
    return -1;

  case AArch64::Bcc: {
    int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
    assert(Idx >= 2);
    return Idx - 2;
  }

  case AArch64::CSINVWr:
  case AArch64::CSINVXr:
  case AArch64::CSINCWr:
  case AArch64::CSINCXr:
  case AArch64::CSELWr:
  case AArch64::CSELXr:
  case AArch64::CSNEGWr:
  case AArch64::CSNEGXr:
  case AArch64::FCSELSrrr:
  case AArch64::FCSELDrrr: {
    int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
    assert(Idx >= 1);
    return Idx - 1;
  }
  }
}

/// Find a condition code used by the instruction.
/// Returns AArch64CC::Invalid if either the instruction does not use condition
/// codes or we don't optimize CmpInstr in the presence of such instructions.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
  int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
  return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
                          Instr.getOperand(CCIdx).getImm())
                    : AArch64CC::Invalid;
}

static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
  assert(CC != AArch64CC::Invalid);
  UsedNZCV UsedFlags;
  switch (CC) {
  default:
    break;

  case AArch64CC::EQ: // Z set
  case AArch64CC::NE: // Z clear
    UsedFlags.Z = true;
    break;

  case AArch64CC::HI: // Z clear and C set
  case AArch64CC::LS: // Z set or C clear
    UsedFlags.Z = true;
    [[fallthrough]];
  case AArch64CC::HS: // C set
  case AArch64CC::LO: // C clear
    UsedFlags.C = true;
    break;

  case AArch64CC::MI: // N set
  case AArch64CC::PL: // N clear
    UsedFlags.N = true;
    break;

  case AArch64CC::VS: // V set
  case AArch64CC::VC: // V clear
    UsedFlags.V = true;
    break;

  case AArch64CC::GT: // Z clear, N and V the same
  case AArch64CC::LE: // Z set, N and V differ
    UsedFlags.Z = true;
    [[fallthrough]];
  case AArch64CC::GE: // N and V the same
  case AArch64CC::LT: // N and V differ
    UsedFlags.N = true;
    UsedFlags.V = true;
    break;
  }
  return UsedFlags;
}

/// \returns Condition flags used after \p CmpInstr in its MachineBB if NZCV
/// flags are not alive in successors of the same \p CmpInstr and \p MI parent.
/// \returns std::nullopt otherwise.
///
/// Collect instructions using those flags in \p CCUseInstrs if provided.
std::optional<UsedNZCV>
llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
                       const TargetRegisterInfo &TRI,
                       SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
  MachineBasicBlock *CmpParent = CmpInstr.getParent();
  if (MI.getParent() != CmpParent)
    return std::nullopt;

  if (areCFlagsAliveInSuccessors(CmpParent))
    return std::nullopt;

  UsedNZCV NZCVUsedAfterCmp;
  for (MachineInstr &Instr : instructionsWithoutDebug(
           std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
    if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
      AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
      if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
        return std::nullopt;
      NZCVUsedAfterCmp |= getUsedNZCV(CC);
      if (CCUseInstrs)
        CCUseInstrs->push_back(&Instr);
    }
    if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
      break;
  }
  return NZCVUsedAfterCmp;
}

static bool isADDSRegImm(unsigned Opcode) {
  return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
}

static bool isSUBSRegImm(unsigned Opcode) {
  return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
}

/// Check if CmpInstr can be substituted by MI.
///
/// CmpInstr can be substituted:
/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
/// - and, MI and CmpInstr are from the same MachineBB
/// - and, condition flags are not alive in successors of the CmpInstr parent
/// - and, if MI opcode is the S form there must be no defs of flags between
///        MI and CmpInstr
///        or if MI opcode is not the S form there must be neither defs of
///        flags nor uses of flags between MI and CmpInstr.
/// - and, the C/V flags are not used after CmpInstr
///        or the V flag is used but MI produces a poison value if signed
///        overflow occurs.
static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
                                       const TargetRegisterInfo &TRI) {
  // NOTE this assertion guarantees that MI.getOpcode() is add or subtraction
  // that may or may not set flags.
  assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);

  const unsigned CmpOpcode = CmpInstr.getOpcode();
  if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
    return false;

  assert((CmpInstr.getOperand(2).isImm() &&
          CmpInstr.getOperand(2).getImm() == 0) &&
         "Caller guarantees that CmpInstr compares with constant 0");

  std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
  if (!NZVCUsed || NZVCUsed->C)
    return false;

  // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
  // '%vreg = add ...' or '%vreg = sub ...'.
  // Condition flag V is used to indicate signed overflow.
  // 1) MI and CmpInstr set N and V to the same value.
  // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
  //    signed overflow occurs, so CmpInstr could still be simplified away.
  if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap))
    return false;

  AccessKind AccessToCheck = AK_Write;
  if (sForm(MI) != MI.getOpcode())
    AccessToCheck = AK_All;
  return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
}

/// Substitute an instruction comparing to zero with another instruction
/// which produces the needed condition flags.
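/// For example (illustrative registers):
///   sub  w8, w9, w10
///   ...
///   cmp  w8, #0
/// can be rewritten as
///   subs w8, w9, w10
/// with the cmp removed, subject to the flag-liveness checks performed here.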
1739 /// 1740 /// Return true on success. 1741 bool AArch64InstrInfo::substituteCmpToZero( 1742 MachineInstr &CmpInstr, unsigned SrcReg, 1743 const MachineRegisterInfo &MRI) const { 1744 // Get the unique definition of SrcReg. 1745 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg); 1746 if (!MI) 1747 return false; 1748 1749 const TargetRegisterInfo &TRI = getRegisterInfo(); 1750 1751 unsigned NewOpc = sForm(*MI); 1752 if (NewOpc == AArch64::INSTRUCTION_LIST_END) 1753 return false; 1754 1755 if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI)) 1756 return false; 1757 1758 // Update the instruction to set NZCV. 1759 MI->setDesc(get(NewOpc)); 1760 CmpInstr.eraseFromParent(); 1761 bool succeeded = UpdateOperandRegClass(*MI); 1762 (void)succeeded; 1763 assert(succeeded && "Some operands reg class are incompatible!"); 1764 MI->addRegisterDefined(AArch64::NZCV, &TRI); 1765 return true; 1766 } 1767 1768 /// \returns True if \p CmpInstr can be removed. 1769 /// 1770 /// \p IsInvertCC is true if, after removing \p CmpInstr, condition 1771 /// codes used in \p CCUseInstrs must be inverted. 1772 static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr, 1773 int CmpValue, const TargetRegisterInfo &TRI, 1774 SmallVectorImpl<MachineInstr *> &CCUseInstrs, 1775 bool &IsInvertCC) { 1776 assert((CmpValue == 0 || CmpValue == 1) && 1777 "Only comparisons to 0 or 1 considered for removal!"); 1778 1779 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>' 1780 unsigned MIOpc = MI.getOpcode(); 1781 if (MIOpc == AArch64::CSINCWr) { 1782 if (MI.getOperand(1).getReg() != AArch64::WZR || 1783 MI.getOperand(2).getReg() != AArch64::WZR) 1784 return false; 1785 } else if (MIOpc == AArch64::CSINCXr) { 1786 if (MI.getOperand(1).getReg() != AArch64::XZR || 1787 MI.getOperand(2).getReg() != AArch64::XZR) 1788 return false; 1789 } else { 1790 return false; 1791 } 1792 AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI); 1793 if (MICC == AArch64CC::Invalid) 1794 return false; 1795 1796 // NZCV needs to be defined 1797 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) 1798 return false; 1799 1800 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1' 1801 const unsigned CmpOpcode = CmpInstr.getOpcode(); 1802 bool IsSubsRegImm = isSUBSRegImm(CmpOpcode); 1803 if (CmpValue && !IsSubsRegImm) 1804 return false; 1805 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode)) 1806 return false; 1807 1808 // MI conditions allowed: eq, ne, mi, pl 1809 UsedNZCV MIUsedNZCV = getUsedNZCV(MICC); 1810 if (MIUsedNZCV.C || MIUsedNZCV.V) 1811 return false; 1812 1813 std::optional<UsedNZCV> NZCVUsedAfterCmp = 1814 examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs); 1815 // Condition flags are not used in CmpInstr basic block successors and only 1816 // Z or N flags allowed to be used after CmpInstr within its basic block 1817 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V) 1818 return false; 1819 // Z or N flag used after CmpInstr must correspond to the flag used in MI 1820 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) || 1821 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z)) 1822 return false; 1823 // If CmpInstr is comparison to zero MI conditions are limited to eq, ne 1824 if (MIUsedNZCV.N && !CmpValue) 1825 return false; 1826 1827 // There must be no defs of flags between MI and CmpInstr 1828 if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write)) 1829 return false; 1830 1831 // Condition code is inverted in the following cases: 1832 // 1. 
MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' 1833 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1' 1834 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) || 1835 (!CmpValue && MICC == AArch64CC::NE); 1836 return true; 1837 } 1838 1839 /// Remove comparison in csinc-cmp sequence 1840 /// 1841 /// Examples: 1842 /// 1. \code 1843 /// csinc w9, wzr, wzr, ne 1844 /// cmp w9, #0 1845 /// b.eq 1846 /// \endcode 1847 /// to 1848 /// \code 1849 /// csinc w9, wzr, wzr, ne 1850 /// b.ne 1851 /// \endcode 1852 /// 1853 /// 2. \code 1854 /// csinc x2, xzr, xzr, mi 1855 /// cmp x2, #1 1856 /// b.pl 1857 /// \endcode 1858 /// to 1859 /// \code 1860 /// csinc x2, xzr, xzr, mi 1861 /// b.pl 1862 /// \endcode 1863 /// 1864 /// \param CmpInstr comparison instruction 1865 /// \return True when comparison removed 1866 bool AArch64InstrInfo::removeCmpToZeroOrOne( 1867 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue, 1868 const MachineRegisterInfo &MRI) const { 1869 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg); 1870 if (!MI) 1871 return false; 1872 const TargetRegisterInfo &TRI = getRegisterInfo(); 1873 SmallVector<MachineInstr *, 4> CCUseInstrs; 1874 bool IsInvertCC = false; 1875 if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs, 1876 IsInvertCC)) 1877 return false; 1878 // Make transformation 1879 CmpInstr.eraseFromParent(); 1880 if (IsInvertCC) { 1881 // Invert condition codes in CmpInstr CC users 1882 for (MachineInstr *CCUseInstr : CCUseInstrs) { 1883 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr); 1884 assert(Idx >= 0 && "Unexpected instruction using CC."); 1885 MachineOperand &CCOperand = CCUseInstr->getOperand(Idx); 1886 AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode( 1887 static_cast<AArch64CC::CondCode>(CCOperand.getImm())); 1888 CCOperand.setImm(CCUse); 1889 } 1890 } 1891 return true; 1892 } 1893 1894 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { 1895 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD && 1896 MI.getOpcode() != AArch64::CATCHRET) 1897 return false; 1898 1899 MachineBasicBlock &MBB = *MI.getParent(); 1900 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>(); 1901 auto TRI = Subtarget.getRegisterInfo(); 1902 DebugLoc DL = MI.getDebugLoc(); 1903 1904 if (MI.getOpcode() == AArch64::CATCHRET) { 1905 // Skip to the first instruction before the epilog. 
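    // The ADRP/ADDXri pair built below materializes the address of the target
    // block into X0 at that point, i.e. ahead of the FrameDestroy-flagged
    // epilogue instructions that the loop below walks over.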
    const TargetInstrInfo *TII =
        MBB.getParent()->getSubtarget().getInstrInfo();
    MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
    auto MBBI = MachineBasicBlock::iterator(MI);
    MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
    while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
           FirstEpilogSEH != MBB.begin())
      FirstEpilogSEH = std::prev(FirstEpilogSEH);
    if (FirstEpilogSEH != MBB.begin())
      FirstEpilogSEH = std::next(FirstEpilogSEH);
    BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
        .addReg(AArch64::X0, RegState::Define)
        .addMBB(TargetMBB);
    BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
        .addReg(AArch64::X0, RegState::Define)
        .addReg(AArch64::X0)
        .addMBB(TargetMBB)
        .addImm(0);
    return true;
  }

  Register Reg = MI.getOperand(0).getReg();
  Module &M = *MBB.getParent()->getFunction().getParent();
  if (M.getStackProtectorGuard() == "sysreg") {
    const AArch64SysReg::SysReg *SrcReg =
        AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
    if (!SrcReg)
      report_fatal_error("Unknown SysReg for Stack Protector Guard Register");

    // mrs xN, sysreg
    BuildMI(MBB, MI, DL, get(AArch64::MRS))
        .addDef(Reg, RegState::Renamable)
        .addImm(SrcReg->Encoding);
    int Offset = M.getStackProtectorGuardOffset();
    if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
      // ldr xN, [xN, #offset]
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(Offset / 8);
    } else if (Offset >= -256 && Offset <= 255) {
      // ldur xN, [xN, #offset]
      BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(Offset);
    } else if (Offset >= -4095 && Offset <= 4095) {
      if (Offset > 0) {
        // add xN, xN, #offset
        BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
            .addDef(Reg)
            .addUse(Reg, RegState::Kill)
            .addImm(Offset)
            .addImm(0);
      } else {
        // sub xN, xN, #offset
        BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
            .addDef(Reg)
            .addUse(Reg, RegState::Kill)
            .addImm(-Offset)
            .addImm(0);
      }
      // ldr xN, [xN]
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(0);
    } else {
      // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
      // than 32760.
      // It might be nice to use AArch64::MOVi32imm here, which would get
      // expanded in PreSched2 after PostRA, but our lone scratch Reg already
      // contains the MRS result. findScratchNonCalleeSaveRegister() in
      // AArch64FrameLowering might help us find such a scratch register
      // though. If we failed to find a scratch register, we could emit a
      // stream of add instructions to build up the immediate. Or, we could try
      // to insert an AArch64::MOVi32imm before register allocation so that we
      // didn't need to scavenge for a scratch register.
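      // (A rough sketch of that add-stream idea, not implemented here: two
      // chained add/sub-immediate instructions, one carrying the low 12 bits
      // and one carrying the next 12 bits shifted left by 12, would reach
      // offsets of roughly +/-16 MiB before the final load.)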
1984 report_fatal_error("Unable to encode Stack Protector Guard Offset"); 1985 } 1986 MBB.erase(MI); 1987 return true; 1988 } 1989 1990 const GlobalValue *GV = 1991 cast<GlobalValue>((*MI.memoperands_begin())->getValue()); 1992 const TargetMachine &TM = MBB.getParent()->getTarget(); 1993 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); 1994 const unsigned char MO_NC = AArch64II::MO_NC; 1995 1996 if ((OpFlags & AArch64II::MO_GOT) != 0) { 1997 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg) 1998 .addGlobalAddress(GV, 0, OpFlags); 1999 if (Subtarget.isTargetILP32()) { 2000 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 2001 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 2002 .addDef(Reg32, RegState::Dead) 2003 .addUse(Reg, RegState::Kill) 2004 .addImm(0) 2005 .addMemOperand(*MI.memoperands_begin()) 2006 .addDef(Reg, RegState::Implicit); 2007 } else { 2008 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 2009 .addReg(Reg, RegState::Kill) 2010 .addImm(0) 2011 .addMemOperand(*MI.memoperands_begin()); 2012 } 2013 } else if (TM.getCodeModel() == CodeModel::Large) { 2014 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?"); 2015 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) 2016 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC) 2017 .addImm(0); 2018 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2019 .addReg(Reg, RegState::Kill) 2020 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC) 2021 .addImm(16); 2022 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2023 .addReg(Reg, RegState::Kill) 2024 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC) 2025 .addImm(32); 2026 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2027 .addReg(Reg, RegState::Kill) 2028 .addGlobalAddress(GV, 0, AArch64II::MO_G3) 2029 .addImm(48); 2030 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 2031 .addReg(Reg, RegState::Kill) 2032 .addImm(0) 2033 .addMemOperand(*MI.memoperands_begin()); 2034 } else if (TM.getCodeModel() == CodeModel::Tiny) { 2035 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg) 2036 .addGlobalAddress(GV, 0, OpFlags); 2037 } else { 2038 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg) 2039 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); 2040 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC; 2041 if (Subtarget.isTargetILP32()) { 2042 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 2043 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 2044 .addDef(Reg32, RegState::Dead) 2045 .addUse(Reg, RegState::Kill) 2046 .addGlobalAddress(GV, 0, LoFlags) 2047 .addMemOperand(*MI.memoperands_begin()) 2048 .addDef(Reg, RegState::Implicit); 2049 } else { 2050 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 2051 .addReg(Reg, RegState::Kill) 2052 .addGlobalAddress(GV, 0, LoFlags) 2053 .addMemOperand(*MI.memoperands_begin()); 2054 } 2055 } 2056 2057 MBB.erase(MI); 2058 2059 return true; 2060 } 2061 2062 // Return true if this instruction simply sets its single destination register 2063 // to zero. This is equivalent to a register rename of the zero-register. 
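// For example, 'movz wD, #0' and 'and wD, wzr, #imm' both leave wD equal to
// zero, so callers may treat such a def as an alias of the zero register.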
bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
    if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
      assert(MI.getDesc().getNumOperands() == 3 &&
             MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
      return true;
    }
    break;
  case AArch64::ANDWri: // and Rd, Rzr, #imm
    return MI.getOperand(1).getReg() == AArch64::WZR;
  case AArch64::ANDXri:
    return MI.getOperand(1).getReg() == AArch64::XZR;
  case TargetOpcode::COPY:
    return MI.getOperand(1).getReg() == AArch64::WZR;
  }
  return false;
}

// Return true if this instruction simply renames a general register without
// modifying bits.
bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    break;
  case TargetOpcode::COPY: {
    // GPR32 copies will be lowered to ORRXrs
    Register DstReg = MI.getOperand(0).getReg();
    return (AArch64::GPR32RegClass.contains(DstReg) ||
            AArch64::GPR64RegClass.contains(DstReg));
  }
  case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
    if (MI.getOperand(1).getReg() == AArch64::XZR) {
      assert(MI.getDesc().getNumOperands() == 4 &&
             MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
      return true;
    }
    break;
  case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
    if (MI.getOperand(2).getImm() == 0) {
      assert(MI.getDesc().getNumOperands() == 4 &&
             MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
      return true;
    }
    break;
  }
  return false;
}

// Return true if this instruction simply renames a floating-point register
// without modifying bits.
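// For example, 'orr vD.16b, vN.16b, vN.16b' with both sources equal is just a
// vector register move.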
2118 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) { 2119 switch (MI.getOpcode()) { 2120 default: 2121 break; 2122 case TargetOpcode::COPY: { 2123 Register DstReg = MI.getOperand(0).getReg(); 2124 return AArch64::FPR128RegClass.contains(DstReg); 2125 } 2126 case AArch64::ORRv16i8: 2127 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) { 2128 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() && 2129 "invalid ORRv16i8 operands"); 2130 return true; 2131 } 2132 break; 2133 } 2134 return false; 2135 } 2136 2137 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, 2138 int &FrameIndex) const { 2139 switch (MI.getOpcode()) { 2140 default: 2141 break; 2142 case AArch64::LDRWui: 2143 case AArch64::LDRXui: 2144 case AArch64::LDRBui: 2145 case AArch64::LDRHui: 2146 case AArch64::LDRSui: 2147 case AArch64::LDRDui: 2148 case AArch64::LDRQui: 2149 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && 2150 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { 2151 FrameIndex = MI.getOperand(1).getIndex(); 2152 return MI.getOperand(0).getReg(); 2153 } 2154 break; 2155 } 2156 2157 return 0; 2158 } 2159 2160 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI, 2161 int &FrameIndex) const { 2162 switch (MI.getOpcode()) { 2163 default: 2164 break; 2165 case AArch64::STRWui: 2166 case AArch64::STRXui: 2167 case AArch64::STRBui: 2168 case AArch64::STRHui: 2169 case AArch64::STRSui: 2170 case AArch64::STRDui: 2171 case AArch64::STRQui: 2172 case AArch64::LDR_PXI: 2173 case AArch64::STR_PXI: 2174 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && 2175 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { 2176 FrameIndex = MI.getOperand(1).getIndex(); 2177 return MI.getOperand(0).getReg(); 2178 } 2179 break; 2180 } 2181 return 0; 2182 } 2183 2184 /// Check all MachineMemOperands for a hint to suppress pairing. 2185 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) { 2186 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 2187 return MMO->getFlags() & MOSuppressPair; 2188 }); 2189 } 2190 2191 /// Set a flag on the first MachineMemOperand to suppress pairing. 2192 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) { 2193 if (MI.memoperands_empty()) 2194 return; 2195 (*MI.memoperands_begin())->setFlags(MOSuppressPair); 2196 } 2197 2198 /// Check all MachineMemOperands for a hint that the load/store is strided. 
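/// Note that this only inspects the MOStridedAccess flag already present on
/// the memory operands; it does not try to infer a strided pattern itself.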
2199 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) { 2200 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 2201 return MMO->getFlags() & MOStridedAccess; 2202 }); 2203 } 2204 2205 bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) { 2206 switch (Opc) { 2207 default: 2208 return false; 2209 case AArch64::STURSi: 2210 case AArch64::STRSpre: 2211 case AArch64::STURDi: 2212 case AArch64::STRDpre: 2213 case AArch64::STURQi: 2214 case AArch64::STRQpre: 2215 case AArch64::STURBBi: 2216 case AArch64::STURHHi: 2217 case AArch64::STURWi: 2218 case AArch64::STRWpre: 2219 case AArch64::STURXi: 2220 case AArch64::STRXpre: 2221 case AArch64::LDURSi: 2222 case AArch64::LDRSpre: 2223 case AArch64::LDURDi: 2224 case AArch64::LDRDpre: 2225 case AArch64::LDURQi: 2226 case AArch64::LDRQpre: 2227 case AArch64::LDURWi: 2228 case AArch64::LDRWpre: 2229 case AArch64::LDURXi: 2230 case AArch64::LDRXpre: 2231 case AArch64::LDRSWpre: 2232 case AArch64::LDURSWi: 2233 case AArch64::LDURHHi: 2234 case AArch64::LDURBBi: 2235 case AArch64::LDURSBWi: 2236 case AArch64::LDURSHWi: 2237 return true; 2238 } 2239 } 2240 2241 std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) { 2242 switch (Opc) { 2243 default: return {}; 2244 case AArch64::PRFMui: return AArch64::PRFUMi; 2245 case AArch64::LDRXui: return AArch64::LDURXi; 2246 case AArch64::LDRWui: return AArch64::LDURWi; 2247 case AArch64::LDRBui: return AArch64::LDURBi; 2248 case AArch64::LDRHui: return AArch64::LDURHi; 2249 case AArch64::LDRSui: return AArch64::LDURSi; 2250 case AArch64::LDRDui: return AArch64::LDURDi; 2251 case AArch64::LDRQui: return AArch64::LDURQi; 2252 case AArch64::LDRBBui: return AArch64::LDURBBi; 2253 case AArch64::LDRHHui: return AArch64::LDURHHi; 2254 case AArch64::LDRSBXui: return AArch64::LDURSBXi; 2255 case AArch64::LDRSBWui: return AArch64::LDURSBWi; 2256 case AArch64::LDRSHXui: return AArch64::LDURSHXi; 2257 case AArch64::LDRSHWui: return AArch64::LDURSHWi; 2258 case AArch64::LDRSWui: return AArch64::LDURSWi; 2259 case AArch64::STRXui: return AArch64::STURXi; 2260 case AArch64::STRWui: return AArch64::STURWi; 2261 case AArch64::STRBui: return AArch64::STURBi; 2262 case AArch64::STRHui: return AArch64::STURHi; 2263 case AArch64::STRSui: return AArch64::STURSi; 2264 case AArch64::STRDui: return AArch64::STURDi; 2265 case AArch64::STRQui: return AArch64::STURQi; 2266 case AArch64::STRBBui: return AArch64::STURBBi; 2267 case AArch64::STRHHui: return AArch64::STURHHi; 2268 } 2269 } 2270 2271 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { 2272 switch (Opc) { 2273 default: 2274 return 2; 2275 case AArch64::LDPXi: 2276 case AArch64::LDPDi: 2277 case AArch64::STPXi: 2278 case AArch64::STPDi: 2279 case AArch64::LDNPXi: 2280 case AArch64::LDNPDi: 2281 case AArch64::STNPXi: 2282 case AArch64::STNPDi: 2283 case AArch64::LDPQi: 2284 case AArch64::STPQi: 2285 case AArch64::LDNPQi: 2286 case AArch64::STNPQi: 2287 case AArch64::LDPWi: 2288 case AArch64::LDPSi: 2289 case AArch64::STPWi: 2290 case AArch64::STPSi: 2291 case AArch64::LDNPWi: 2292 case AArch64::LDNPSi: 2293 case AArch64::STNPWi: 2294 case AArch64::STNPSi: 2295 case AArch64::LDG: 2296 case AArch64::STGPi: 2297 2298 case AArch64::LD1B_IMM: 2299 case AArch64::LD1B_H_IMM: 2300 case AArch64::LD1B_S_IMM: 2301 case AArch64::LD1B_D_IMM: 2302 case AArch64::LD1SB_H_IMM: 2303 case AArch64::LD1SB_S_IMM: 2304 case AArch64::LD1SB_D_IMM: 2305 case AArch64::LD1H_IMM: 2306 case AArch64::LD1H_S_IMM: 2307 case AArch64::LD1H_D_IMM: 2308 
case AArch64::LD1SH_S_IMM: 2309 case AArch64::LD1SH_D_IMM: 2310 case AArch64::LD1W_IMM: 2311 case AArch64::LD1W_D_IMM: 2312 case AArch64::LD1SW_D_IMM: 2313 case AArch64::LD1D_IMM: 2314 2315 case AArch64::LD2B_IMM: 2316 case AArch64::LD2H_IMM: 2317 case AArch64::LD2W_IMM: 2318 case AArch64::LD2D_IMM: 2319 case AArch64::LD3B_IMM: 2320 case AArch64::LD3H_IMM: 2321 case AArch64::LD3W_IMM: 2322 case AArch64::LD3D_IMM: 2323 case AArch64::LD4B_IMM: 2324 case AArch64::LD4H_IMM: 2325 case AArch64::LD4W_IMM: 2326 case AArch64::LD4D_IMM: 2327 2328 case AArch64::ST1B_IMM: 2329 case AArch64::ST1B_H_IMM: 2330 case AArch64::ST1B_S_IMM: 2331 case AArch64::ST1B_D_IMM: 2332 case AArch64::ST1H_IMM: 2333 case AArch64::ST1H_S_IMM: 2334 case AArch64::ST1H_D_IMM: 2335 case AArch64::ST1W_IMM: 2336 case AArch64::ST1W_D_IMM: 2337 case AArch64::ST1D_IMM: 2338 2339 case AArch64::ST2B_IMM: 2340 case AArch64::ST2H_IMM: 2341 case AArch64::ST2W_IMM: 2342 case AArch64::ST2D_IMM: 2343 case AArch64::ST3B_IMM: 2344 case AArch64::ST3H_IMM: 2345 case AArch64::ST3W_IMM: 2346 case AArch64::ST3D_IMM: 2347 case AArch64::ST4B_IMM: 2348 case AArch64::ST4H_IMM: 2349 case AArch64::ST4W_IMM: 2350 case AArch64::ST4D_IMM: 2351 2352 case AArch64::LD1RB_IMM: 2353 case AArch64::LD1RB_H_IMM: 2354 case AArch64::LD1RB_S_IMM: 2355 case AArch64::LD1RB_D_IMM: 2356 case AArch64::LD1RSB_H_IMM: 2357 case AArch64::LD1RSB_S_IMM: 2358 case AArch64::LD1RSB_D_IMM: 2359 case AArch64::LD1RH_IMM: 2360 case AArch64::LD1RH_S_IMM: 2361 case AArch64::LD1RH_D_IMM: 2362 case AArch64::LD1RSH_S_IMM: 2363 case AArch64::LD1RSH_D_IMM: 2364 case AArch64::LD1RW_IMM: 2365 case AArch64::LD1RW_D_IMM: 2366 case AArch64::LD1RSW_IMM: 2367 case AArch64::LD1RD_IMM: 2368 2369 case AArch64::LDNT1B_ZRI: 2370 case AArch64::LDNT1H_ZRI: 2371 case AArch64::LDNT1W_ZRI: 2372 case AArch64::LDNT1D_ZRI: 2373 case AArch64::STNT1B_ZRI: 2374 case AArch64::STNT1H_ZRI: 2375 case AArch64::STNT1W_ZRI: 2376 case AArch64::STNT1D_ZRI: 2377 2378 case AArch64::LDNF1B_IMM: 2379 case AArch64::LDNF1B_H_IMM: 2380 case AArch64::LDNF1B_S_IMM: 2381 case AArch64::LDNF1B_D_IMM: 2382 case AArch64::LDNF1SB_H_IMM: 2383 case AArch64::LDNF1SB_S_IMM: 2384 case AArch64::LDNF1SB_D_IMM: 2385 case AArch64::LDNF1H_IMM: 2386 case AArch64::LDNF1H_S_IMM: 2387 case AArch64::LDNF1H_D_IMM: 2388 case AArch64::LDNF1SH_S_IMM: 2389 case AArch64::LDNF1SH_D_IMM: 2390 case AArch64::LDNF1W_IMM: 2391 case AArch64::LDNF1W_D_IMM: 2392 case AArch64::LDNF1SW_D_IMM: 2393 case AArch64::LDNF1D_IMM: 2394 return 3; 2395 case AArch64::ADDG: 2396 case AArch64::STGi: 2397 case AArch64::LDR_PXI: 2398 case AArch64::STR_PXI: 2399 return 2; 2400 } 2401 } 2402 2403 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) { 2404 switch (MI.getOpcode()) { 2405 default: 2406 return false; 2407 // Scaled instructions. 2408 case AArch64::STRSui: 2409 case AArch64::STRDui: 2410 case AArch64::STRQui: 2411 case AArch64::STRXui: 2412 case AArch64::STRWui: 2413 case AArch64::LDRSui: 2414 case AArch64::LDRDui: 2415 case AArch64::LDRQui: 2416 case AArch64::LDRXui: 2417 case AArch64::LDRWui: 2418 case AArch64::LDRSWui: 2419 // Unscaled instructions. 
2420 case AArch64::STURSi: 2421 case AArch64::STRSpre: 2422 case AArch64::STURDi: 2423 case AArch64::STRDpre: 2424 case AArch64::STURQi: 2425 case AArch64::STRQpre: 2426 case AArch64::STURWi: 2427 case AArch64::STRWpre: 2428 case AArch64::STURXi: 2429 case AArch64::STRXpre: 2430 case AArch64::LDURSi: 2431 case AArch64::LDRSpre: 2432 case AArch64::LDURDi: 2433 case AArch64::LDRDpre: 2434 case AArch64::LDURQi: 2435 case AArch64::LDRQpre: 2436 case AArch64::LDURWi: 2437 case AArch64::LDRWpre: 2438 case AArch64::LDURXi: 2439 case AArch64::LDRXpre: 2440 case AArch64::LDURSWi: 2441 case AArch64::LDRSWpre: 2442 return true; 2443 } 2444 } 2445 2446 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) { 2447 switch (Opc) { 2448 default: 2449 llvm_unreachable("Opcode has no flag setting equivalent!"); 2450 // 32-bit cases: 2451 case AArch64::ADDWri: 2452 return AArch64::ADDSWri; 2453 case AArch64::ADDWrr: 2454 return AArch64::ADDSWrr; 2455 case AArch64::ADDWrs: 2456 return AArch64::ADDSWrs; 2457 case AArch64::ADDWrx: 2458 return AArch64::ADDSWrx; 2459 case AArch64::ANDWri: 2460 return AArch64::ANDSWri; 2461 case AArch64::ANDWrr: 2462 return AArch64::ANDSWrr; 2463 case AArch64::ANDWrs: 2464 return AArch64::ANDSWrs; 2465 case AArch64::BICWrr: 2466 return AArch64::BICSWrr; 2467 case AArch64::BICWrs: 2468 return AArch64::BICSWrs; 2469 case AArch64::SUBWri: 2470 return AArch64::SUBSWri; 2471 case AArch64::SUBWrr: 2472 return AArch64::SUBSWrr; 2473 case AArch64::SUBWrs: 2474 return AArch64::SUBSWrs; 2475 case AArch64::SUBWrx: 2476 return AArch64::SUBSWrx; 2477 // 64-bit cases: 2478 case AArch64::ADDXri: 2479 return AArch64::ADDSXri; 2480 case AArch64::ADDXrr: 2481 return AArch64::ADDSXrr; 2482 case AArch64::ADDXrs: 2483 return AArch64::ADDSXrs; 2484 case AArch64::ADDXrx: 2485 return AArch64::ADDSXrx; 2486 case AArch64::ANDXri: 2487 return AArch64::ANDSXri; 2488 case AArch64::ANDXrr: 2489 return AArch64::ANDSXrr; 2490 case AArch64::ANDXrs: 2491 return AArch64::ANDSXrs; 2492 case AArch64::BICXrr: 2493 return AArch64::BICSXrr; 2494 case AArch64::BICXrs: 2495 return AArch64::BICSXrs; 2496 case AArch64::SUBXri: 2497 return AArch64::SUBSXri; 2498 case AArch64::SUBXrr: 2499 return AArch64::SUBSXrr; 2500 case AArch64::SUBXrs: 2501 return AArch64::SUBSXrs; 2502 case AArch64::SUBXrx: 2503 return AArch64::SUBSXrx; 2504 // SVE instructions: 2505 case AArch64::AND_PPzPP: 2506 return AArch64::ANDS_PPzPP; 2507 case AArch64::BIC_PPzPP: 2508 return AArch64::BICS_PPzPP; 2509 case AArch64::EOR_PPzPP: 2510 return AArch64::EORS_PPzPP; 2511 case AArch64::NAND_PPzPP: 2512 return AArch64::NANDS_PPzPP; 2513 case AArch64::NOR_PPzPP: 2514 return AArch64::NORS_PPzPP; 2515 case AArch64::ORN_PPzPP: 2516 return AArch64::ORNS_PPzPP; 2517 case AArch64::ORR_PPzPP: 2518 return AArch64::ORRS_PPzPP; 2519 case AArch64::BRKA_PPzP: 2520 return AArch64::BRKAS_PPzP; 2521 case AArch64::BRKPA_PPzPP: 2522 return AArch64::BRKPAS_PPzPP; 2523 case AArch64::BRKB_PPzP: 2524 return AArch64::BRKBS_PPzP; 2525 case AArch64::BRKPB_PPzPP: 2526 return AArch64::BRKPBS_PPzPP; 2527 case AArch64::BRKN_PPzP: 2528 return AArch64::BRKNS_PPzP; 2529 case AArch64::RDFFR_PPz: 2530 return AArch64::RDFFRS_PPz; 2531 case AArch64::PTRUE_B: 2532 return AArch64::PTRUES_B; 2533 } 2534 } 2535 2536 // Is this a candidate for ld/st merging or pairing? For example, we don't 2537 // touch volatiles or load/stores that have a hint to avoid pair formation. 
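// The checks below run roughly from most general to most specific: ordered
// (e.g. volatile) memory references, the reg/FI-plus-immediate addressing
// form, base-register modification, explicit no-pair hints, Windows CFI
// prologue/epilogue constraints, and finally subtarget tuning for slow
// 128-bit pairs.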
2538 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const { 2539 2540 bool IsPreLdSt = isPreLdSt(MI); 2541 2542 // If this is a volatile load/store, don't mess with it. 2543 if (MI.hasOrderedMemoryRef()) 2544 return false; 2545 2546 // Make sure this is a reg/fi+imm (as opposed to an address reloc). 2547 // For Pre-inc LD/ST, the operand is shifted by one. 2548 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() || 2549 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) && 2550 "Expected a reg or frame index operand."); 2551 2552 // For Pre-indexed addressing quadword instructions, the third operand is the 2553 // immediate value. 2554 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm(); 2555 2556 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt) 2557 return false; 2558 2559 // Can't merge/pair if the instruction modifies the base register. 2560 // e.g., ldr x0, [x0] 2561 // This case will never occur with an FI base. 2562 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or 2563 // STR<S,D,Q,W,X>pre, it can be merged. 2564 // For example: 2565 // ldr q0, [x11, #32]! 2566 // ldr q1, [x11, #16] 2567 // to 2568 // ldp q0, q1, [x11, #32]! 2569 if (MI.getOperand(1).isReg() && !IsPreLdSt) { 2570 Register BaseReg = MI.getOperand(1).getReg(); 2571 const TargetRegisterInfo *TRI = &getRegisterInfo(); 2572 if (MI.modifiesRegister(BaseReg, TRI)) 2573 return false; 2574 } 2575 2576 // Check if this load/store has a hint to avoid pair formation. 2577 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. 2578 if (isLdStPairSuppressed(MI)) 2579 return false; 2580 2581 // Do not pair any callee-save store/reload instructions in the 2582 // prologue/epilogue if the CFI information encoded the operations as separate 2583 // instructions, as that will cause the size of the actual prologue to mismatch 2584 // with the prologue size recorded in the Windows CFI. 2585 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo(); 2586 bool NeedsWinCFI = MAI->usesWindowsCFI() && 2587 MI.getMF()->getFunction().needsUnwindTableEntry(); 2588 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) || 2589 MI.getFlag(MachineInstr::FrameDestroy))) 2590 return false; 2591 2592 // On some CPUs quad load/store pairs are slower than two single load/stores. 2593 if (Subtarget.isPaired128Slow()) { 2594 switch (MI.getOpcode()) { 2595 default: 2596 break; 2597 case AArch64::LDURQi: 2598 case AArch64::STURQi: 2599 case AArch64::LDRQui: 2600 case AArch64::STRQui: 2601 return false; 2602 } 2603 } 2604 2605 return true; 2606 } 2607 2608 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth( 2609 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, 2610 int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, 2611 const TargetRegisterInfo *TRI) const { 2612 if (!LdSt.mayLoadOrStore()) 2613 return false; 2614 2615 const MachineOperand *BaseOp; 2616 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable, 2617 Width, TRI)) 2618 return false; 2619 BaseOps.push_back(BaseOp); 2620 return true; 2621 } 2622 2623 std::optional<ExtAddrMode> 2624 AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI, 2625 const TargetRegisterInfo *TRI) const { 2626 const MachineOperand *Base; // Filled with the base operand of MI. 2627 int64_t Offset; // Filled with the offset of MI. 
2628 bool OffsetIsScalable; 2629 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI)) 2630 return std::nullopt; 2631 2632 if (!Base->isReg()) 2633 return std::nullopt; 2634 ExtAddrMode AM; 2635 AM.BaseReg = Base->getReg(); 2636 AM.Displacement = Offset; 2637 AM.ScaledReg = 0; 2638 AM.Scale = 0; 2639 return AM; 2640 } 2641 2642 bool AArch64InstrInfo::getMemOperandWithOffsetWidth( 2643 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset, 2644 bool &OffsetIsScalable, unsigned &Width, 2645 const TargetRegisterInfo *TRI) const { 2646 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 2647 // Handle only loads/stores with base register followed by immediate offset. 2648 if (LdSt.getNumExplicitOperands() == 3) { 2649 // Non-paired instruction (e.g., ldr x1, [x0, #8]). 2650 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) || 2651 !LdSt.getOperand(2).isImm()) 2652 return false; 2653 } else if (LdSt.getNumExplicitOperands() == 4) { 2654 // Paired instruction (e.g., ldp x1, x2, [x0, #8]). 2655 if (!LdSt.getOperand(1).isReg() || 2656 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) || 2657 !LdSt.getOperand(3).isImm()) 2658 return false; 2659 } else 2660 return false; 2661 2662 // Get the scaling factor for the instruction and set the width for the 2663 // instruction. 2664 TypeSize Scale(0U, false); 2665 int64_t Dummy1, Dummy2; 2666 2667 // If this returns false, then it's an instruction we don't want to handle. 2668 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2)) 2669 return false; 2670 2671 // Compute the offset. Offset is calculated as the immediate operand 2672 // multiplied by the scaling factor. Unscaled instructions have scaling factor 2673 // set to 1. 2674 if (LdSt.getNumExplicitOperands() == 3) { 2675 BaseOp = &LdSt.getOperand(1); 2676 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue(); 2677 } else { 2678 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); 2679 BaseOp = &LdSt.getOperand(2); 2680 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue(); 2681 } 2682 OffsetIsScalable = Scale.isScalable(); 2683 2684 if (!BaseOp->isReg() && !BaseOp->isFI()) 2685 return false; 2686 2687 return true; 2688 } 2689 2690 MachineOperand & 2691 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const { 2692 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 2693 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1); 2694 assert(OfsOp.isImm() && "Offset operand wasn't immediate."); 2695 return OfsOp; 2696 } 2697 2698 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, 2699 unsigned &Width, int64_t &MinOffset, 2700 int64_t &MaxOffset) { 2701 const unsigned SVEMaxBytesPerVector = AArch64::SVEMaxBitsPerVector / 8; 2702 switch (Opcode) { 2703 // Not a memory operation or something we want to handle. 
2704 default: 2705 Scale = TypeSize::Fixed(0); 2706 Width = 0; 2707 MinOffset = MaxOffset = 0; 2708 return false; 2709 case AArch64::STRWpost: 2710 case AArch64::LDRWpost: 2711 Width = 32; 2712 Scale = TypeSize::Fixed(4); 2713 MinOffset = -256; 2714 MaxOffset = 255; 2715 break; 2716 case AArch64::LDURQi: 2717 case AArch64::STURQi: 2718 Width = 16; 2719 Scale = TypeSize::Fixed(1); 2720 MinOffset = -256; 2721 MaxOffset = 255; 2722 break; 2723 case AArch64::PRFUMi: 2724 case AArch64::LDURXi: 2725 case AArch64::LDURDi: 2726 case AArch64::STURXi: 2727 case AArch64::STURDi: 2728 Width = 8; 2729 Scale = TypeSize::Fixed(1); 2730 MinOffset = -256; 2731 MaxOffset = 255; 2732 break; 2733 case AArch64::LDURWi: 2734 case AArch64::LDURSi: 2735 case AArch64::LDURSWi: 2736 case AArch64::STURWi: 2737 case AArch64::STURSi: 2738 Width = 4; 2739 Scale = TypeSize::Fixed(1); 2740 MinOffset = -256; 2741 MaxOffset = 255; 2742 break; 2743 case AArch64::LDURHi: 2744 case AArch64::LDURHHi: 2745 case AArch64::LDURSHXi: 2746 case AArch64::LDURSHWi: 2747 case AArch64::STURHi: 2748 case AArch64::STURHHi: 2749 Width = 2; 2750 Scale = TypeSize::Fixed(1); 2751 MinOffset = -256; 2752 MaxOffset = 255; 2753 break; 2754 case AArch64::LDURBi: 2755 case AArch64::LDURBBi: 2756 case AArch64::LDURSBXi: 2757 case AArch64::LDURSBWi: 2758 case AArch64::STURBi: 2759 case AArch64::STURBBi: 2760 Width = 1; 2761 Scale = TypeSize::Fixed(1); 2762 MinOffset = -256; 2763 MaxOffset = 255; 2764 break; 2765 case AArch64::LDPQi: 2766 case AArch64::LDNPQi: 2767 case AArch64::STPQi: 2768 case AArch64::STNPQi: 2769 Scale = TypeSize::Fixed(16); 2770 Width = 32; 2771 MinOffset = -64; 2772 MaxOffset = 63; 2773 break; 2774 case AArch64::LDRQui: 2775 case AArch64::STRQui: 2776 Scale = TypeSize::Fixed(16); 2777 Width = 16; 2778 MinOffset = 0; 2779 MaxOffset = 4095; 2780 break; 2781 case AArch64::LDPXi: 2782 case AArch64::LDPDi: 2783 case AArch64::LDNPXi: 2784 case AArch64::LDNPDi: 2785 case AArch64::STPXi: 2786 case AArch64::STPDi: 2787 case AArch64::STNPXi: 2788 case AArch64::STNPDi: 2789 Scale = TypeSize::Fixed(8); 2790 Width = 16; 2791 MinOffset = -64; 2792 MaxOffset = 63; 2793 break; 2794 case AArch64::PRFMui: 2795 case AArch64::LDRXui: 2796 case AArch64::LDRDui: 2797 case AArch64::STRXui: 2798 case AArch64::STRDui: 2799 Scale = TypeSize::Fixed(8); 2800 Width = 8; 2801 MinOffset = 0; 2802 MaxOffset = 4095; 2803 break; 2804 case AArch64::StoreSwiftAsyncContext: 2805 // Store is an STRXui, but there might be an ADDXri in the expansion too. 
2806 Scale = TypeSize::Fixed(1); 2807 Width = 8; 2808 MinOffset = 0; 2809 MaxOffset = 4095; 2810 break; 2811 case AArch64::LDPWi: 2812 case AArch64::LDPSi: 2813 case AArch64::LDNPWi: 2814 case AArch64::LDNPSi: 2815 case AArch64::STPWi: 2816 case AArch64::STPSi: 2817 case AArch64::STNPWi: 2818 case AArch64::STNPSi: 2819 Scale = TypeSize::Fixed(4); 2820 Width = 8; 2821 MinOffset = -64; 2822 MaxOffset = 63; 2823 break; 2824 case AArch64::LDRWui: 2825 case AArch64::LDRSui: 2826 case AArch64::LDRSWui: 2827 case AArch64::STRWui: 2828 case AArch64::STRSui: 2829 Scale = TypeSize::Fixed(4); 2830 Width = 4; 2831 MinOffset = 0; 2832 MaxOffset = 4095; 2833 break; 2834 case AArch64::LDRHui: 2835 case AArch64::LDRHHui: 2836 case AArch64::LDRSHWui: 2837 case AArch64::LDRSHXui: 2838 case AArch64::STRHui: 2839 case AArch64::STRHHui: 2840 Scale = TypeSize::Fixed(2); 2841 Width = 2; 2842 MinOffset = 0; 2843 MaxOffset = 4095; 2844 break; 2845 case AArch64::LDRBui: 2846 case AArch64::LDRBBui: 2847 case AArch64::LDRSBWui: 2848 case AArch64::LDRSBXui: 2849 case AArch64::STRBui: 2850 case AArch64::STRBBui: 2851 Scale = TypeSize::Fixed(1); 2852 Width = 1; 2853 MinOffset = 0; 2854 MaxOffset = 4095; 2855 break; 2856 case AArch64::STPXpre: 2857 case AArch64::LDPXpost: 2858 case AArch64::STPDpre: 2859 case AArch64::LDPDpost: 2860 Scale = TypeSize::Fixed(8); 2861 Width = 8; 2862 MinOffset = -512; 2863 MaxOffset = 504; 2864 break; 2865 case AArch64::STPQpre: 2866 case AArch64::LDPQpost: 2867 Scale = TypeSize::Fixed(16); 2868 Width = 16; 2869 MinOffset = -1024; 2870 MaxOffset = 1008; 2871 break; 2872 case AArch64::STRXpre: 2873 case AArch64::STRDpre: 2874 case AArch64::LDRXpost: 2875 case AArch64::LDRDpost: 2876 Scale = TypeSize::Fixed(1); 2877 Width = 8; 2878 MinOffset = -256; 2879 MaxOffset = 255; 2880 break; 2881 case AArch64::STRQpre: 2882 case AArch64::LDRQpost: 2883 Scale = TypeSize::Fixed(1); 2884 Width = 16; 2885 MinOffset = -256; 2886 MaxOffset = 255; 2887 break; 2888 case AArch64::ADDG: 2889 Scale = TypeSize::Fixed(16); 2890 Width = 0; 2891 MinOffset = 0; 2892 MaxOffset = 63; 2893 break; 2894 case AArch64::TAGPstack: 2895 Scale = TypeSize::Fixed(16); 2896 Width = 0; 2897 // TAGP with a negative offset turns into SUBP, which has a maximum offset 2898 // of 63 (not 64!). 
2899 MinOffset = -63; 2900 MaxOffset = 63; 2901 break; 2902 case AArch64::LDG: 2903 case AArch64::STGi: 2904 case AArch64::STZGi: 2905 Scale = TypeSize::Fixed(16); 2906 Width = 16; 2907 MinOffset = -256; 2908 MaxOffset = 255; 2909 break; 2910 case AArch64::STR_ZZZZXI: 2911 case AArch64::LDR_ZZZZXI: 2912 Scale = TypeSize::Scalable(16); 2913 Width = SVEMaxBytesPerVector * 4; 2914 MinOffset = -256; 2915 MaxOffset = 252; 2916 break; 2917 case AArch64::STR_ZZZXI: 2918 case AArch64::LDR_ZZZXI: 2919 Scale = TypeSize::Scalable(16); 2920 Width = SVEMaxBytesPerVector * 3; 2921 MinOffset = -256; 2922 MaxOffset = 253; 2923 break; 2924 case AArch64::STR_ZZXI: 2925 case AArch64::LDR_ZZXI: 2926 Scale = TypeSize::Scalable(16); 2927 Width = SVEMaxBytesPerVector * 2; 2928 MinOffset = -256; 2929 MaxOffset = 254; 2930 break; 2931 case AArch64::LDR_PXI: 2932 case AArch64::STR_PXI: 2933 Scale = TypeSize::Scalable(2); 2934 Width = SVEMaxBytesPerVector / 8; 2935 MinOffset = -256; 2936 MaxOffset = 255; 2937 break; 2938 case AArch64::LDR_ZXI: 2939 case AArch64::STR_ZXI: 2940 Scale = TypeSize::Scalable(16); 2941 Width = SVEMaxBytesPerVector; 2942 MinOffset = -256; 2943 MaxOffset = 255; 2944 break; 2945 case AArch64::LD1B_IMM: 2946 case AArch64::LD1H_IMM: 2947 case AArch64::LD1W_IMM: 2948 case AArch64::LD1D_IMM: 2949 case AArch64::LDNT1B_ZRI: 2950 case AArch64::LDNT1H_ZRI: 2951 case AArch64::LDNT1W_ZRI: 2952 case AArch64::LDNT1D_ZRI: 2953 case AArch64::ST1B_IMM: 2954 case AArch64::ST1H_IMM: 2955 case AArch64::ST1W_IMM: 2956 case AArch64::ST1D_IMM: 2957 case AArch64::STNT1B_ZRI: 2958 case AArch64::STNT1H_ZRI: 2959 case AArch64::STNT1W_ZRI: 2960 case AArch64::STNT1D_ZRI: 2961 case AArch64::LDNF1B_IMM: 2962 case AArch64::LDNF1H_IMM: 2963 case AArch64::LDNF1W_IMM: 2964 case AArch64::LDNF1D_IMM: 2965 // A full vectors worth of data 2966 // Width = mbytes * elements 2967 Scale = TypeSize::Scalable(16); 2968 Width = SVEMaxBytesPerVector; 2969 MinOffset = -8; 2970 MaxOffset = 7; 2971 break; 2972 case AArch64::LD2B_IMM: 2973 case AArch64::LD2H_IMM: 2974 case AArch64::LD2W_IMM: 2975 case AArch64::LD2D_IMM: 2976 case AArch64::ST2B_IMM: 2977 case AArch64::ST2H_IMM: 2978 case AArch64::ST2W_IMM: 2979 case AArch64::ST2D_IMM: 2980 Scale = TypeSize::Scalable(32); 2981 Width = SVEMaxBytesPerVector * 2; 2982 MinOffset = -8; 2983 MaxOffset = 7; 2984 break; 2985 case AArch64::LD3B_IMM: 2986 case AArch64::LD3H_IMM: 2987 case AArch64::LD3W_IMM: 2988 case AArch64::LD3D_IMM: 2989 case AArch64::ST3B_IMM: 2990 case AArch64::ST3H_IMM: 2991 case AArch64::ST3W_IMM: 2992 case AArch64::ST3D_IMM: 2993 Scale = TypeSize::Scalable(48); 2994 Width = SVEMaxBytesPerVector * 3; 2995 MinOffset = -8; 2996 MaxOffset = 7; 2997 break; 2998 case AArch64::LD4B_IMM: 2999 case AArch64::LD4H_IMM: 3000 case AArch64::LD4W_IMM: 3001 case AArch64::LD4D_IMM: 3002 case AArch64::ST4B_IMM: 3003 case AArch64::ST4H_IMM: 3004 case AArch64::ST4W_IMM: 3005 case AArch64::ST4D_IMM: 3006 Scale = TypeSize::Scalable(64); 3007 Width = SVEMaxBytesPerVector * 4; 3008 MinOffset = -8; 3009 MaxOffset = 7; 3010 break; 3011 case AArch64::LD1B_H_IMM: 3012 case AArch64::LD1SB_H_IMM: 3013 case AArch64::LD1H_S_IMM: 3014 case AArch64::LD1SH_S_IMM: 3015 case AArch64::LD1W_D_IMM: 3016 case AArch64::LD1SW_D_IMM: 3017 case AArch64::ST1B_H_IMM: 3018 case AArch64::ST1H_S_IMM: 3019 case AArch64::ST1W_D_IMM: 3020 case AArch64::LDNF1B_H_IMM: 3021 case AArch64::LDNF1SB_H_IMM: 3022 case AArch64::LDNF1H_S_IMM: 3023 case AArch64::LDNF1SH_S_IMM: 3024 case AArch64::LDNF1W_D_IMM: 3025 case 
AArch64::LDNF1SW_D_IMM:
    // A half vector's worth of data
    // Width = mbytes * elements
    Scale = TypeSize::Scalable(8);
    Width = SVEMaxBytesPerVector / 2;
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD1B_S_IMM:
  case AArch64::LD1SB_S_IMM:
  case AArch64::LD1H_D_IMM:
  case AArch64::LD1SH_D_IMM:
  case AArch64::ST1B_S_IMM:
  case AArch64::ST1H_D_IMM:
  case AArch64::LDNF1B_S_IMM:
  case AArch64::LDNF1SB_S_IMM:
  case AArch64::LDNF1H_D_IMM:
  case AArch64::LDNF1SH_D_IMM:
    // A quarter vector's worth of data
    // Width = mbytes * elements
    Scale = TypeSize::Scalable(4);
    Width = SVEMaxBytesPerVector / 4;
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD1B_D_IMM:
  case AArch64::LD1SB_D_IMM:
  case AArch64::ST1B_D_IMM:
  case AArch64::LDNF1B_D_IMM:
  case AArch64::LDNF1SB_D_IMM:
    // An eighth vector's worth of data
    // Width = mbytes * elements
    Scale = TypeSize::Scalable(2);
    Width = SVEMaxBytesPerVector / 8;
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::ST2Gi:
  case AArch64::STZ2Gi:
    Scale = TypeSize::Fixed(16);
    Width = 32;
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::STGPi:
    Scale = TypeSize::Fixed(16);
    Width = 16;
    MinOffset = -64;
    MaxOffset = 63;
    break;
  case AArch64::LD1RB_IMM:
  case AArch64::LD1RB_H_IMM:
  case AArch64::LD1RB_S_IMM:
  case AArch64::LD1RB_D_IMM:
  case AArch64::LD1RSB_H_IMM:
  case AArch64::LD1RSB_S_IMM:
  case AArch64::LD1RSB_D_IMM:
    Scale = TypeSize::Fixed(1);
    Width = 1;
    MinOffset = 0;
    MaxOffset = 63;
    break;
  case AArch64::LD1RH_IMM:
  case AArch64::LD1RH_S_IMM:
  case AArch64::LD1RH_D_IMM:
  case AArch64::LD1RSH_S_IMM:
  case AArch64::LD1RSH_D_IMM:
    Scale = TypeSize::Fixed(2);
    Width = 2;
    MinOffset = 0;
    MaxOffset = 63;
    break;
  case AArch64::LD1RW_IMM:
  case AArch64::LD1RW_D_IMM:
  case AArch64::LD1RSW_IMM:
    Scale = TypeSize::Fixed(4);
    Width = 4;
    MinOffset = 0;
    MaxOffset = 63;
    break;
  case AArch64::LD1RD_IMM:
    Scale = TypeSize::Fixed(8);
    Width = 8;
    MinOffset = 0;
    MaxOffset = 63;
    break;
  }

  return true;
}

// Scaling factor for unscaled load or store.
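// For example, getMemScale(AArch64::LDURXi) == 8, so an unscaled byte offset
// of 16 corresponds to an element offset of 2 when forming the LDP/STP
// equivalent (see scaleOffset below).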
3117 int AArch64InstrInfo::getMemScale(unsigned Opc) { 3118 switch (Opc) { 3119 default: 3120 llvm_unreachable("Opcode has unknown scale!"); 3121 case AArch64::LDRBBui: 3122 case AArch64::LDURBBi: 3123 case AArch64::LDRSBWui: 3124 case AArch64::LDURSBWi: 3125 case AArch64::STRBBui: 3126 case AArch64::STURBBi: 3127 return 1; 3128 case AArch64::LDRHHui: 3129 case AArch64::LDURHHi: 3130 case AArch64::LDRSHWui: 3131 case AArch64::LDURSHWi: 3132 case AArch64::STRHHui: 3133 case AArch64::STURHHi: 3134 return 2; 3135 case AArch64::LDRSui: 3136 case AArch64::LDURSi: 3137 case AArch64::LDRSpre: 3138 case AArch64::LDRSWui: 3139 case AArch64::LDURSWi: 3140 case AArch64::LDRSWpre: 3141 case AArch64::LDRWpre: 3142 case AArch64::LDRWui: 3143 case AArch64::LDURWi: 3144 case AArch64::STRSui: 3145 case AArch64::STURSi: 3146 case AArch64::STRSpre: 3147 case AArch64::STRWui: 3148 case AArch64::STURWi: 3149 case AArch64::STRWpre: 3150 case AArch64::LDPSi: 3151 case AArch64::LDPSWi: 3152 case AArch64::LDPWi: 3153 case AArch64::STPSi: 3154 case AArch64::STPWi: 3155 return 4; 3156 case AArch64::LDRDui: 3157 case AArch64::LDURDi: 3158 case AArch64::LDRDpre: 3159 case AArch64::LDRXui: 3160 case AArch64::LDURXi: 3161 case AArch64::LDRXpre: 3162 case AArch64::STRDui: 3163 case AArch64::STURDi: 3164 case AArch64::STRDpre: 3165 case AArch64::STRXui: 3166 case AArch64::STURXi: 3167 case AArch64::STRXpre: 3168 case AArch64::LDPDi: 3169 case AArch64::LDPXi: 3170 case AArch64::STPDi: 3171 case AArch64::STPXi: 3172 return 8; 3173 case AArch64::LDRQui: 3174 case AArch64::LDURQi: 3175 case AArch64::STRQui: 3176 case AArch64::STURQi: 3177 case AArch64::STRQpre: 3178 case AArch64::LDPQi: 3179 case AArch64::LDRQpre: 3180 case AArch64::STPQi: 3181 case AArch64::STGi: 3182 case AArch64::STZGi: 3183 case AArch64::ST2Gi: 3184 case AArch64::STZ2Gi: 3185 case AArch64::STGPi: 3186 return 16; 3187 } 3188 } 3189 3190 bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) { 3191 switch (MI.getOpcode()) { 3192 default: 3193 return false; 3194 case AArch64::LDRWpre: 3195 case AArch64::LDRXpre: 3196 case AArch64::LDRSWpre: 3197 case AArch64::LDRSpre: 3198 case AArch64::LDRDpre: 3199 case AArch64::LDRQpre: 3200 return true; 3201 } 3202 } 3203 3204 bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) { 3205 switch (MI.getOpcode()) { 3206 default: 3207 return false; 3208 case AArch64::STRWpre: 3209 case AArch64::STRXpre: 3210 case AArch64::STRSpre: 3211 case AArch64::STRDpre: 3212 case AArch64::STRQpre: 3213 return true; 3214 } 3215 } 3216 3217 bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) { 3218 return isPreLd(MI) || isPreSt(MI); 3219 } 3220 3221 bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) { 3222 switch (MI.getOpcode()) { 3223 default: 3224 return false; 3225 case AArch64::LDPSi: 3226 case AArch64::LDPSWi: 3227 case AArch64::LDPDi: 3228 case AArch64::LDPQi: 3229 case AArch64::LDPWi: 3230 case AArch64::LDPXi: 3231 case AArch64::STPSi: 3232 case AArch64::STPDi: 3233 case AArch64::STPQi: 3234 case AArch64::STPWi: 3235 case AArch64::STPXi: 3236 case AArch64::STGPi: 3237 return true; 3238 } 3239 } 3240 3241 const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) { 3242 unsigned Idx = 3243 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 
2 3244 : 1; 3245 return MI.getOperand(Idx); 3246 } 3247 3248 const MachineOperand & 3249 AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) { 3250 unsigned Idx = 3251 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3 3252 : 2; 3253 return MI.getOperand(Idx); 3254 } 3255 3256 static const TargetRegisterClass *getRegClass(const MachineInstr &MI, 3257 Register Reg) { 3258 if (MI.getParent() == nullptr) 3259 return nullptr; 3260 const MachineFunction *MF = MI.getParent()->getParent(); 3261 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr; 3262 } 3263 3264 bool AArch64InstrInfo::isHForm(const MachineInstr &MI) { 3265 auto IsHFPR = [&](const MachineOperand &Op) { 3266 if (!Op.isReg()) 3267 return false; 3268 auto Reg = Op.getReg(); 3269 if (Reg.isPhysical()) 3270 return AArch64::FPR16RegClass.contains(Reg); 3271 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); 3272 return TRC == &AArch64::FPR16RegClass || 3273 TRC == &AArch64::FPR16_loRegClass; 3274 }; 3275 return llvm::any_of(MI.operands(), IsHFPR); 3276 } 3277 3278 bool AArch64InstrInfo::isQForm(const MachineInstr &MI) { 3279 auto IsQFPR = [&](const MachineOperand &Op) { 3280 if (!Op.isReg()) 3281 return false; 3282 auto Reg = Op.getReg(); 3283 if (Reg.isPhysical()) 3284 return AArch64::FPR128RegClass.contains(Reg); 3285 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); 3286 return TRC == &AArch64::FPR128RegClass || 3287 TRC == &AArch64::FPR128_loRegClass; 3288 }; 3289 return llvm::any_of(MI.operands(), IsQFPR); 3290 } 3291 3292 bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) { 3293 auto IsFPR = [&](const MachineOperand &Op) { 3294 if (!Op.isReg()) 3295 return false; 3296 auto Reg = Op.getReg(); 3297 if (Reg.isPhysical()) 3298 return AArch64::FPR128RegClass.contains(Reg) || 3299 AArch64::FPR64RegClass.contains(Reg) || 3300 AArch64::FPR32RegClass.contains(Reg) || 3301 AArch64::FPR16RegClass.contains(Reg) || 3302 AArch64::FPR8RegClass.contains(Reg); 3303 3304 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); 3305 return TRC == &AArch64::FPR128RegClass || 3306 TRC == &AArch64::FPR128_loRegClass || 3307 TRC == &AArch64::FPR64RegClass || 3308 TRC == &AArch64::FPR64_loRegClass || 3309 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass || 3310 TRC == &AArch64::FPR8RegClass; 3311 }; 3312 return llvm::any_of(MI.operands(), IsFPR); 3313 } 3314 3315 // Scale the unscaled offsets. Returns false if the unscaled offset can't be 3316 // scaled. 3317 static bool scaleOffset(unsigned Opc, int64_t &Offset) { 3318 int Scale = AArch64InstrInfo::getMemScale(Opc); 3319 3320 // If the byte-offset isn't a multiple of the stride, we can't scale this 3321 // offset. 3322 if (Offset % Scale != 0) 3323 return false; 3324 3325 // Convert the byte-offset used by unscaled into an "element" offset used 3326 // by the scaled pair load/store instructions. 3327 Offset /= Scale; 3328 return true; 3329 } 3330 3331 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) { 3332 if (FirstOpc == SecondOpc) 3333 return true; 3334 // We can also pair sign-ext and zero-ext instructions. 3335 switch (FirstOpc) { 3336 default: 3337 return false; 3338 case AArch64::LDRWui: 3339 case AArch64::LDURWi: 3340 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi; 3341 case AArch64::LDRSWui: 3342 case AArch64::LDURSWi: 3343 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi; 3344 } 3345 // These instructions can't be paired based on their opcodes. 
3346 return false; 3347 } 3348 3349 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, 3350 int64_t Offset1, unsigned Opcode1, int FI2, 3351 int64_t Offset2, unsigned Opcode2) { 3352 // Accesses through fixed stack object frame indices may access a different 3353 // fixed stack slot. Check that the object offsets + offsets match. 3354 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) { 3355 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1); 3356 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2); 3357 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered."); 3358 // Convert to scaled object offsets. 3359 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1); 3360 if (ObjectOffset1 % Scale1 != 0) 3361 return false; 3362 ObjectOffset1 /= Scale1; 3363 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2); 3364 if (ObjectOffset2 % Scale2 != 0) 3365 return false; 3366 ObjectOffset2 /= Scale2; 3367 ObjectOffset1 += Offset1; 3368 ObjectOffset2 += Offset2; 3369 return ObjectOffset1 + 1 == ObjectOffset2; 3370 } 3371 3372 return FI1 == FI2; 3373 } 3374 3375 /// Detect opportunities for ldp/stp formation. 3376 /// 3377 /// Only called for LdSt for which getMemOperandWithOffset returns true. 3378 bool AArch64InstrInfo::shouldClusterMemOps( 3379 ArrayRef<const MachineOperand *> BaseOps1, 3380 ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads, 3381 unsigned NumBytes) const { 3382 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1); 3383 const MachineOperand &BaseOp1 = *BaseOps1.front(); 3384 const MachineOperand &BaseOp2 = *BaseOps2.front(); 3385 const MachineInstr &FirstLdSt = *BaseOp1.getParent(); 3386 const MachineInstr &SecondLdSt = *BaseOp2.getParent(); 3387 if (BaseOp1.getType() != BaseOp2.getType()) 3388 return false; 3389 3390 assert((BaseOp1.isReg() || BaseOp1.isFI()) && 3391 "Only base registers and frame indices are supported."); 3392 3393 // Check for both base regs and base FI. 3394 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg()) 3395 return false; 3396 3397 // Only cluster up to a single pair. 3398 if (NumLoads > 2) 3399 return false; 3400 3401 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt)) 3402 return false; 3403 3404 // Can we pair these instructions based on their opcodes? 3405 unsigned FirstOpc = FirstLdSt.getOpcode(); 3406 unsigned SecondOpc = SecondLdSt.getOpcode(); 3407 if (!canPairLdStOpc(FirstOpc, SecondOpc)) 3408 return false; 3409 3410 // Can't merge volatiles or load/stores that have a hint to avoid pair 3411 // formation, for example. 3412 if (!isCandidateToMergeOrPair(FirstLdSt) || 3413 !isCandidateToMergeOrPair(SecondLdSt)) 3414 return false; 3415 3416 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate. 3417 int64_t Offset1 = FirstLdSt.getOperand(2).getImm(); 3418 if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1)) 3419 return false; 3420 3421 int64_t Offset2 = SecondLdSt.getOperand(2).getImm(); 3422 if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2)) 3423 return false; 3424 3425 // Pairwise instructions have a 7-bit signed offset field. 3426 if (Offset1 > 63 || Offset1 < -64) 3427 return false; 3428 3429 // The caller should already have ordered First/SecondLdSt by offset. 
3430 // Note: except for non-equal frame index bases 3431 if (BaseOp1.isFI()) { 3432 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) && 3433 "Caller should have ordered offsets."); 3434 3435 const MachineFrameInfo &MFI = 3436 FirstLdSt.getParent()->getParent()->getFrameInfo(); 3437 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc, 3438 BaseOp2.getIndex(), Offset2, SecondOpc); 3439 } 3440 3441 assert(Offset1 <= Offset2 && "Caller should have ordered offsets."); 3442 3443 return Offset1 + 1 == Offset2; 3444 } 3445 3446 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, 3447 unsigned Reg, unsigned SubIdx, 3448 unsigned State, 3449 const TargetRegisterInfo *TRI) { 3450 if (!SubIdx) 3451 return MIB.addReg(Reg, State); 3452 3453 if (Register::isPhysicalRegister(Reg)) 3454 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); 3455 return MIB.addReg(Reg, State, SubIdx); 3456 } 3457 3458 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, 3459 unsigned NumRegs) { 3460 // We really want the positive remainder mod 32 here, that happens to be 3461 // easily obtainable with a mask. 3462 return ((DestReg - SrcReg) & 0x1f) < NumRegs; 3463 } 3464 3465 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB, 3466 MachineBasicBlock::iterator I, 3467 const DebugLoc &DL, MCRegister DestReg, 3468 MCRegister SrcReg, bool KillSrc, 3469 unsigned Opcode, 3470 ArrayRef<unsigned> Indices) const { 3471 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON"); 3472 const TargetRegisterInfo *TRI = &getRegisterInfo(); 3473 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 3474 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 3475 unsigned NumRegs = Indices.size(); 3476 3477 int SubReg = 0, End = NumRegs, Incr = 1; 3478 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) { 3479 SubReg = NumRegs - 1; 3480 End = -1; 3481 Incr = -1; 3482 } 3483 3484 for (; SubReg != End; SubReg += Incr) { 3485 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 3486 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 3487 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); 3488 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 3489 } 3490 } 3491 3492 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB, 3493 MachineBasicBlock::iterator I, 3494 DebugLoc DL, unsigned DestReg, 3495 unsigned SrcReg, bool KillSrc, 3496 unsigned Opcode, unsigned ZeroReg, 3497 llvm::ArrayRef<unsigned> Indices) const { 3498 const TargetRegisterInfo *TRI = &getRegisterInfo(); 3499 unsigned NumRegs = Indices.size(); 3500 3501 #ifndef NDEBUG 3502 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 3503 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 3504 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 && 3505 "GPR reg sequences should not be able to overlap"); 3506 #endif 3507 3508 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) { 3509 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 3510 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 3511 MIB.addReg(ZeroReg); 3512 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 3513 MIB.addImm(0); 3514 } 3515 } 3516 3517 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, 3518 MachineBasicBlock::iterator I, 3519 const DebugLoc &DL, MCRegister DestReg, 3520 MCRegister SrcReg, bool KillSrc) const { 3521 if (AArch64::GPR32spRegClass.contains(DestReg) && 
3522 (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { 3523 const TargetRegisterInfo *TRI = &getRegisterInfo(); 3524 3525 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) { 3526 // If either operand is WSP, expand to ADD #0. 3527 if (Subtarget.hasZeroCycleRegMove()) { 3528 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. 3529 MCRegister DestRegX = TRI->getMatchingSuperReg( 3530 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 3531 MCRegister SrcRegX = TRI->getMatchingSuperReg( 3532 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 3533 // This instruction is reading and writing X registers. This may upset 3534 // the register scavenger and machine verifier, so we need to indicate 3535 // that we are reading an undefined value from SrcRegX, but a proper 3536 // value from SrcReg. 3537 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX) 3538 .addReg(SrcRegX, RegState::Undef) 3539 .addImm(0) 3540 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) 3541 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 3542 } else { 3543 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg) 3544 .addReg(SrcReg, getKillRegState(KillSrc)) 3545 .addImm(0) 3546 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3547 } 3548 } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) { 3549 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg) 3550 .addImm(0) 3551 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3552 } else { 3553 if (Subtarget.hasZeroCycleRegMove()) { 3554 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. 3555 MCRegister DestRegX = TRI->getMatchingSuperReg( 3556 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 3557 MCRegister SrcRegX = TRI->getMatchingSuperReg( 3558 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 3559 // This instruction is reading and writing X registers. This may upset 3560 // the register scavenger and machine verifier, so we need to indicate 3561 // that we are reading an undefined value from SrcRegX, but a proper 3562 // value from SrcReg. 3563 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX) 3564 .addReg(AArch64::XZR) 3565 .addReg(SrcRegX, RegState::Undef) 3566 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 3567 } else { 3568 // Otherwise, expand to ORR WZR. 3569 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg) 3570 .addReg(AArch64::WZR) 3571 .addReg(SrcReg, getKillRegState(KillSrc)); 3572 } 3573 } 3574 return; 3575 } 3576 3577 // Copy a Predicate register by ORRing with itself. 3578 if (AArch64::PPRRegClass.contains(DestReg) && 3579 AArch64::PPRRegClass.contains(SrcReg)) { 3580 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 3581 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg) 3582 .addReg(SrcReg) // Pg 3583 .addReg(SrcReg) 3584 .addReg(SrcReg, getKillRegState(KillSrc)); 3585 return; 3586 } 3587 3588 // Copy a Z register by ORRing with itself. 3589 if (AArch64::ZPRRegClass.contains(DestReg) && 3590 AArch64::ZPRRegClass.contains(SrcReg)) { 3591 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 3592 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg) 3593 .addReg(SrcReg) 3594 .addReg(SrcReg, getKillRegState(KillSrc)); 3595 return; 3596 } 3597 3598 // Copy a Z register pair by copying the individual sub-registers. 
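  // (Each zsubN sub-register is moved with an unpredicated ORR_ZZZ, which in
  // assembly is "orr zD.d, zS.d, zS.d".)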
3599 if (AArch64::ZPR2RegClass.contains(DestReg) && 3600 AArch64::ZPR2RegClass.contains(SrcReg)) { 3601 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 3602 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1}; 3603 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 3604 Indices); 3605 return; 3606 } 3607 3608 // Copy a Z register triple by copying the individual sub-registers. 3609 if (AArch64::ZPR3RegClass.contains(DestReg) && 3610 AArch64::ZPR3RegClass.contains(SrcReg)) { 3611 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 3612 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, 3613 AArch64::zsub2}; 3614 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 3615 Indices); 3616 return; 3617 } 3618 3619 // Copy a Z register quad by copying the individual sub-registers. 3620 if (AArch64::ZPR4RegClass.contains(DestReg) && 3621 AArch64::ZPR4RegClass.contains(SrcReg)) { 3622 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 3623 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, 3624 AArch64::zsub2, AArch64::zsub3}; 3625 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 3626 Indices); 3627 return; 3628 } 3629 3630 if (AArch64::GPR64spRegClass.contains(DestReg) && 3631 (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { 3632 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { 3633 // If either operand is SP, expand to ADD #0. 3634 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg) 3635 .addReg(SrcReg, getKillRegState(KillSrc)) 3636 .addImm(0) 3637 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3638 } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) { 3639 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg) 3640 .addImm(0) 3641 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3642 } else { 3643 // Otherwise, expand to ORR XZR. 3644 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg) 3645 .addReg(AArch64::XZR) 3646 .addReg(SrcReg, getKillRegState(KillSrc)); 3647 } 3648 return; 3649 } 3650 3651 // Copy a DDDD register quad by copying the individual sub-registers. 3652 if (AArch64::DDDDRegClass.contains(DestReg) && 3653 AArch64::DDDDRegClass.contains(SrcReg)) { 3654 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 3655 AArch64::dsub2, AArch64::dsub3}; 3656 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 3657 Indices); 3658 return; 3659 } 3660 3661 // Copy a DDD register triple by copying the individual sub-registers. 3662 if (AArch64::DDDRegClass.contains(DestReg) && 3663 AArch64::DDDRegClass.contains(SrcReg)) { 3664 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 3665 AArch64::dsub2}; 3666 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 3667 Indices); 3668 return; 3669 } 3670 3671 // Copy a DD register pair by copying the individual sub-registers. 3672 if (AArch64::DDRegClass.contains(DestReg) && 3673 AArch64::DDRegClass.contains(SrcReg)) { 3674 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1}; 3675 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 3676 Indices); 3677 return; 3678 } 3679 3680 // Copy a QQQQ register quad by copying the individual sub-registers. 
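  // (copyPhysRegTuple copies one qsubN at a time with a vector ORR and, for
  // overlapping tuples, iterates backwards so that no source register is
  // clobbered before it has been read; e.g. when copying Q1_Q2_Q3_Q4 into
  // Q2_Q3_Q4_Q5, Q4 is copied to Q5 first.)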
3681 if (AArch64::QQQQRegClass.contains(DestReg) && 3682 AArch64::QQQQRegClass.contains(SrcReg)) { 3683 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 3684 AArch64::qsub2, AArch64::qsub3}; 3685 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 3686 Indices); 3687 return; 3688 } 3689 3690 // Copy a QQQ register triple by copying the individual sub-registers. 3691 if (AArch64::QQQRegClass.contains(DestReg) && 3692 AArch64::QQQRegClass.contains(SrcReg)) { 3693 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 3694 AArch64::qsub2}; 3695 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 3696 Indices); 3697 return; 3698 } 3699 3700 // Copy a QQ register pair by copying the individual sub-registers. 3701 if (AArch64::QQRegClass.contains(DestReg) && 3702 AArch64::QQRegClass.contains(SrcReg)) { 3703 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1}; 3704 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 3705 Indices); 3706 return; 3707 } 3708 3709 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) && 3710 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) { 3711 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64}; 3712 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs, 3713 AArch64::XZR, Indices); 3714 return; 3715 } 3716 3717 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) && 3718 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) { 3719 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32}; 3720 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs, 3721 AArch64::WZR, Indices); 3722 return; 3723 } 3724 3725 if (AArch64::FPR128RegClass.contains(DestReg) && 3726 AArch64::FPR128RegClass.contains(SrcReg)) { 3727 if (Subtarget.hasSVEorSME() && !Subtarget.isNeonAvailable()) 3728 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ)) 3729 .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define) 3730 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0)) 3731 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0)); 3732 else if (Subtarget.hasNEON()) 3733 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 3734 .addReg(SrcReg) 3735 .addReg(SrcReg, getKillRegState(KillSrc)); 3736 else { 3737 BuildMI(MBB, I, DL, get(AArch64::STRQpre)) 3738 .addReg(AArch64::SP, RegState::Define) 3739 .addReg(SrcReg, getKillRegState(KillSrc)) 3740 .addReg(AArch64::SP) 3741 .addImm(-16); 3742 BuildMI(MBB, I, DL, get(AArch64::LDRQpre)) 3743 .addReg(AArch64::SP, RegState::Define) 3744 .addReg(DestReg, RegState::Define) 3745 .addReg(AArch64::SP) 3746 .addImm(16); 3747 } 3748 return; 3749 } 3750 3751 if (AArch64::FPR64RegClass.contains(DestReg) && 3752 AArch64::FPR64RegClass.contains(SrcReg)) { 3753 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg) 3754 .addReg(SrcReg, getKillRegState(KillSrc)); 3755 return; 3756 } 3757 3758 if (AArch64::FPR32RegClass.contains(DestReg) && 3759 AArch64::FPR32RegClass.contains(SrcReg)) { 3760 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 3761 .addReg(SrcReg, getKillRegState(KillSrc)); 3762 return; 3763 } 3764 3765 if (AArch64::FPR16RegClass.contains(DestReg) && 3766 AArch64::FPR16RegClass.contains(SrcReg)) { 3767 DestReg = 3768 RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass); 3769 SrcReg = 3770 RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass); 3771 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 3772 .addReg(SrcReg, getKillRegState(KillSrc)); 3773 return; 3774 } 3775 
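  // Copy an FPR8 register in the same way as the FPR16 case above: widen both
  // operands to their containing S register and emit an FMOVSr, so that e.g. a
  // b0 -> b1 copy is emitted as "fmov s1, s0" (only the low 8 bits are
  // meaningful).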
3776 if (AArch64::FPR8RegClass.contains(DestReg) && 3777 AArch64::FPR8RegClass.contains(SrcReg)) { 3778 DestReg = 3779 RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass); 3780 SrcReg = 3781 RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass); 3782 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 3783 .addReg(SrcReg, getKillRegState(KillSrc)); 3784 return; 3785 } 3786 3787 // Copies between GPR64 and FPR64. 3788 if (AArch64::FPR64RegClass.contains(DestReg) && 3789 AArch64::GPR64RegClass.contains(SrcReg)) { 3790 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) 3791 .addReg(SrcReg, getKillRegState(KillSrc)); 3792 return; 3793 } 3794 if (AArch64::GPR64RegClass.contains(DestReg) && 3795 AArch64::FPR64RegClass.contains(SrcReg)) { 3796 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg) 3797 .addReg(SrcReg, getKillRegState(KillSrc)); 3798 return; 3799 } 3800 // Copies between GPR32 and FPR32. 3801 if (AArch64::FPR32RegClass.contains(DestReg) && 3802 AArch64::GPR32RegClass.contains(SrcReg)) { 3803 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) 3804 .addReg(SrcReg, getKillRegState(KillSrc)); 3805 return; 3806 } 3807 if (AArch64::GPR32RegClass.contains(DestReg) && 3808 AArch64::FPR32RegClass.contains(SrcReg)) { 3809 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg) 3810 .addReg(SrcReg, getKillRegState(KillSrc)); 3811 return; 3812 } 3813 3814 if (DestReg == AArch64::NZCV) { 3815 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy"); 3816 BuildMI(MBB, I, DL, get(AArch64::MSR)) 3817 .addImm(AArch64SysReg::NZCV) 3818 .addReg(SrcReg, getKillRegState(KillSrc)) 3819 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define); 3820 return; 3821 } 3822 3823 if (SrcReg == AArch64::NZCV) { 3824 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy"); 3825 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg) 3826 .addImm(AArch64SysReg::NZCV) 3827 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); 3828 return; 3829 } 3830 3831 #ifndef NDEBUG 3832 const TargetRegisterInfo &TRI = getRegisterInfo(); 3833 errs() << TRI.getRegAsmName(DestReg) << " = COPY " 3834 << TRI.getRegAsmName(SrcReg) << "\n"; 3835 #endif 3836 llvm_unreachable("unimplemented reg-to-reg copy"); 3837 } 3838 3839 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, 3840 MachineBasicBlock &MBB, 3841 MachineBasicBlock::iterator InsertBefore, 3842 const MCInstrDesc &MCID, 3843 Register SrcReg, bool IsKill, 3844 unsigned SubIdx0, unsigned SubIdx1, int FI, 3845 MachineMemOperand *MMO) { 3846 Register SrcReg0 = SrcReg; 3847 Register SrcReg1 = SrcReg; 3848 if (SrcReg.isPhysical()) { 3849 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0); 3850 SubIdx0 = 0; 3851 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1); 3852 SubIdx1 = 0; 3853 } 3854 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 3855 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0) 3856 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1) 3857 .addFrameIndex(FI) 3858 .addImm(0) 3859 .addMemOperand(MMO); 3860 } 3861 3862 void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, 3863 MachineBasicBlock::iterator MBBI, 3864 Register SrcReg, bool isKill, int FI, 3865 const TargetRegisterClass *RC, 3866 const TargetRegisterInfo *TRI, 3867 Register VReg) const { 3868 MachineFunction &MF = *MBB.getParent(); 3869 MachineFrameInfo &MFI = MF.getFrameInfo(); 3870 3871 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 3872 MachineMemOperand *MMO = 3873 MF.getMachineMemOperand(PtrInfo, 
MachineMemOperand::MOStore, 3874 MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); 3875 unsigned Opc = 0; 3876 bool Offset = true; 3877 unsigned StackID = TargetStackID::Default; 3878 switch (TRI->getSpillSize(*RC)) { 3879 case 1: 3880 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 3881 Opc = AArch64::STRBui; 3882 break; 3883 case 2: 3884 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 3885 Opc = AArch64::STRHui; 3886 else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { 3887 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3888 Opc = AArch64::STR_PXI; 3889 StackID = TargetStackID::ScalableVector; 3890 } 3891 break; 3892 case 4: 3893 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 3894 Opc = AArch64::STRWui; 3895 if (SrcReg.isVirtual()) 3896 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass); 3897 else 3898 assert(SrcReg != AArch64::WSP); 3899 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 3900 Opc = AArch64::STRSui; 3901 break; 3902 case 8: 3903 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 3904 Opc = AArch64::STRXui; 3905 if (SrcReg.isVirtual()) 3906 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 3907 else 3908 assert(SrcReg != AArch64::SP); 3909 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 3910 Opc = AArch64::STRDui; 3911 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 3912 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 3913 get(AArch64::STPWi), SrcReg, isKill, 3914 AArch64::sube32, AArch64::subo32, FI, MMO); 3915 return; 3916 } 3917 break; 3918 case 16: 3919 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 3920 Opc = AArch64::STRQui; 3921 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 3922 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3923 Opc = AArch64::ST1Twov1d; 3924 Offset = false; 3925 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 3926 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 3927 get(AArch64::STPXi), SrcReg, isKill, 3928 AArch64::sube64, AArch64::subo64, FI, MMO); 3929 return; 3930 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 3931 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3932 Opc = AArch64::STR_ZXI; 3933 StackID = TargetStackID::ScalableVector; 3934 } 3935 break; 3936 case 24: 3937 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 3938 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3939 Opc = AArch64::ST1Threev1d; 3940 Offset = false; 3941 } 3942 break; 3943 case 32: 3944 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 3945 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3946 Opc = AArch64::ST1Fourv1d; 3947 Offset = false; 3948 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 3949 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3950 Opc = AArch64::ST1Twov2d; 3951 Offset = false; 3952 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { 3953 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3954 Opc = AArch64::STR_ZZXI; 3955 StackID = TargetStackID::ScalableVector; 3956 } 3957 break; 3958 case 48: 3959 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 3960 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3961 Opc = AArch64::ST1Threev2d; 3962 Offset = false; 3963 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { 3964 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3965 Opc = AArch64::STR_ZZZXI; 3966 StackID = TargetStackID::ScalableVector; 3967 } 3968 
break; 3969 case 64: 3970 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 3971 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3972 Opc = AArch64::ST1Fourv2d; 3973 Offset = false; 3974 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { 3975 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3976 Opc = AArch64::STR_ZZZZXI; 3977 StackID = TargetStackID::ScalableVector; 3978 } 3979 break; 3980 } 3981 assert(Opc && "Unknown register class"); 3982 MFI.setStackID(FI, StackID); 3983 3984 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 3985 .addReg(SrcReg, getKillRegState(isKill)) 3986 .addFrameIndex(FI); 3987 3988 if (Offset) 3989 MI.addImm(0); 3990 MI.addMemOperand(MMO); 3991 } 3992 3993 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, 3994 MachineBasicBlock &MBB, 3995 MachineBasicBlock::iterator InsertBefore, 3996 const MCInstrDesc &MCID, 3997 Register DestReg, unsigned SubIdx0, 3998 unsigned SubIdx1, int FI, 3999 MachineMemOperand *MMO) { 4000 Register DestReg0 = DestReg; 4001 Register DestReg1 = DestReg; 4002 bool IsUndef = true; 4003 if (DestReg.isPhysical()) { 4004 DestReg0 = TRI.getSubReg(DestReg, SubIdx0); 4005 SubIdx0 = 0; 4006 DestReg1 = TRI.getSubReg(DestReg, SubIdx1); 4007 SubIdx1 = 0; 4008 IsUndef = false; 4009 } 4010 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 4011 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0) 4012 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1) 4013 .addFrameIndex(FI) 4014 .addImm(0) 4015 .addMemOperand(MMO); 4016 } 4017 4018 void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, 4019 MachineBasicBlock::iterator MBBI, 4020 Register DestReg, int FI, 4021 const TargetRegisterClass *RC, 4022 const TargetRegisterInfo *TRI, 4023 Register VReg) const { 4024 MachineFunction &MF = *MBB.getParent(); 4025 MachineFrameInfo &MFI = MF.getFrameInfo(); 4026 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 4027 MachineMemOperand *MMO = 4028 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, 4029 MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); 4030 4031 unsigned Opc = 0; 4032 bool Offset = true; 4033 unsigned StackID = TargetStackID::Default; 4034 switch (TRI->getSpillSize(*RC)) { 4035 case 1: 4036 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 4037 Opc = AArch64::LDRBui; 4038 break; 4039 case 2: 4040 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 4041 Opc = AArch64::LDRHui; 4042 else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { 4043 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 4044 Opc = AArch64::LDR_PXI; 4045 StackID = TargetStackID::ScalableVector; 4046 } 4047 break; 4048 case 4: 4049 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 4050 Opc = AArch64::LDRWui; 4051 if (DestReg.isVirtual()) 4052 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass); 4053 else 4054 assert(DestReg != AArch64::WSP); 4055 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 4056 Opc = AArch64::LDRSui; 4057 break; 4058 case 8: 4059 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 4060 Opc = AArch64::LDRXui; 4061 if (DestReg.isVirtual()) 4062 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass); 4063 else 4064 assert(DestReg != AArch64::SP); 4065 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 4066 Opc = AArch64::LDRDui; 4067 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 4068 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 4069 
get(AArch64::LDPWi), DestReg, AArch64::sube32, 4070 AArch64::subo32, FI, MMO); 4071 return; 4072 } 4073 break; 4074 case 16: 4075 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 4076 Opc = AArch64::LDRQui; 4077 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 4078 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 4079 Opc = AArch64::LD1Twov1d; 4080 Offset = false; 4081 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 4082 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 4083 get(AArch64::LDPXi), DestReg, AArch64::sube64, 4084 AArch64::subo64, FI, MMO); 4085 return; 4086 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 4087 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 4088 Opc = AArch64::LDR_ZXI; 4089 StackID = TargetStackID::ScalableVector; 4090 } 4091 break; 4092 case 24: 4093 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 4094 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 4095 Opc = AArch64::LD1Threev1d; 4096 Offset = false; 4097 } 4098 break; 4099 case 32: 4100 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 4101 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 4102 Opc = AArch64::LD1Fourv1d; 4103 Offset = false; 4104 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 4105 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 4106 Opc = AArch64::LD1Twov2d; 4107 Offset = false; 4108 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { 4109 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 4110 Opc = AArch64::LDR_ZZXI; 4111 StackID = TargetStackID::ScalableVector; 4112 } 4113 break; 4114 case 48: 4115 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 4116 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 4117 Opc = AArch64::LD1Threev2d; 4118 Offset = false; 4119 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { 4120 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 4121 Opc = AArch64::LDR_ZZZXI; 4122 StackID = TargetStackID::ScalableVector; 4123 } 4124 break; 4125 case 64: 4126 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 4127 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 4128 Opc = AArch64::LD1Fourv2d; 4129 Offset = false; 4130 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { 4131 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 4132 Opc = AArch64::LDR_ZZZZXI; 4133 StackID = TargetStackID::ScalableVector; 4134 } 4135 break; 4136 } 4137 4138 assert(Opc && "Unknown register class"); 4139 MFI.setStackID(FI, StackID); 4140 4141 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 4142 .addReg(DestReg, getDefRegState(true)) 4143 .addFrameIndex(FI); 4144 if (Offset) 4145 MI.addImm(0); 4146 MI.addMemOperand(MMO); 4147 } 4148 4149 bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, 4150 const MachineInstr &UseMI, 4151 const TargetRegisterInfo *TRI) { 4152 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()), 4153 UseMI.getIterator()), 4154 [TRI](const MachineInstr &I) { 4155 return I.modifiesRegister(AArch64::NZCV, TRI) || 4156 I.readsRegister(AArch64::NZCV, TRI); 4157 }); 4158 } 4159 4160 void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets( 4161 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) { 4162 // The smallest scalable element supported by scaled SVE addressing 4163 // modes are predicates, which are 2 scalable bytes in size. 
So the scalable
  // byte offset must always be a multiple of 2.
  assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");

  // VGSized offsets are divided by '2', because the VG register is the number
  // of 64-bit granules as opposed to 128-bit vector chunks, which is how the
  // 'n' in e.g. MVT::nxv1i8 is modelled.
  // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
  // VG = n * 2 and the DWARF offset must be VG * 8 bytes.
  ByteSized = Offset.getFixed();
  VGSized = Offset.getScalable() / 2;
}

/// Returns the offset in parts to which this frame offset can be
/// decomposed for the purpose of describing a frame offset.
/// For non-scalable offsets this is simply its byte size.
void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
    const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
    int64_t &NumDataVectors) {
  // The smallest scalable elements supported by scaled SVE addressing modes
  // are predicates, which are 2 scalable bytes in size. So the scalable byte
  // offset must always be a multiple of 2.
  assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");

  NumBytes = Offset.getFixed();
  NumDataVectors = 0;
  NumPredicateVectors = Offset.getScalable() / 2;
  // This method is used to get the offsets to adjust the frame offset.
  // If the function requires ADDPL to be used and needs more than two ADDPL
  // instructions, part of the offset is folded into NumDataVectors so that it
  // uses ADDVL for part of it, reducing the number of ADDPL instructions.
  // For example, a scalable offset of 130 bytes (65 predicate-vector units)
  // becomes NumDataVectors = 8 and NumPredicateVectors = 1.
  if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
      NumPredicateVectors > 62) {
    NumDataVectors = NumPredicateVectors / 8;
    NumPredicateVectors -= NumDataVectors * 8;
  }
}

// Convenience function to create a DWARF expression for
//   Expr + NumBytes + NumVGScaledBytes * AArch64::VG
static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes,
                                     int NumVGScaledBytes, unsigned VG,
                                     llvm::raw_string_ostream &Comment) {
  uint8_t buffer[16];

  if (NumBytes) {
    Expr.push_back(dwarf::DW_OP_consts);
    Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer));
    Expr.push_back((uint8_t)dwarf::DW_OP_plus);
    Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes);
  }

  if (NumVGScaledBytes) {
    Expr.push_back((uint8_t)dwarf::DW_OP_consts);
    Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer));

    Expr.push_back((uint8_t)dwarf::DW_OP_bregx);
    Expr.append(buffer, buffer + encodeULEB128(VG, buffer));
    Expr.push_back(0);

    Expr.push_back((uint8_t)dwarf::DW_OP_mul);
    Expr.push_back((uint8_t)dwarf::DW_OP_plus);

    Comment << (NumVGScaledBytes < 0 ?
" - " : " + ") 4227 << std::abs(NumVGScaledBytes) << " * VG"; 4228 } 4229 } 4230 4231 // Creates an MCCFIInstruction: 4232 // { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr } 4233 static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, 4234 unsigned Reg, 4235 const StackOffset &Offset) { 4236 int64_t NumBytes, NumVGScaledBytes; 4237 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes, 4238 NumVGScaledBytes); 4239 std::string CommentBuffer; 4240 llvm::raw_string_ostream Comment(CommentBuffer); 4241 4242 if (Reg == AArch64::SP) 4243 Comment << "sp"; 4244 else if (Reg == AArch64::FP) 4245 Comment << "fp"; 4246 else 4247 Comment << printReg(Reg, &TRI); 4248 4249 // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG) 4250 SmallString<64> Expr; 4251 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); 4252 Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg)); 4253 Expr.push_back(0); 4254 appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes, 4255 TRI.getDwarfRegNum(AArch64::VG, true), Comment); 4256 4257 // Wrap this into DW_CFA_def_cfa. 4258 SmallString<64> DefCfaExpr; 4259 DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression); 4260 uint8_t buffer[16]; 4261 DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer)); 4262 DefCfaExpr.append(Expr.str()); 4263 return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(), 4264 Comment.str()); 4265 } 4266 4267 MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI, 4268 unsigned FrameReg, unsigned Reg, 4269 const StackOffset &Offset, 4270 bool LastAdjustmentWasScalable) { 4271 if (Offset.getScalable()) 4272 return createDefCFAExpression(TRI, Reg, Offset); 4273 4274 if (FrameReg == Reg && !LastAdjustmentWasScalable) 4275 return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed())); 4276 4277 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); 4278 return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed()); 4279 } 4280 4281 MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI, 4282 unsigned Reg, 4283 const StackOffset &OffsetFromDefCFA) { 4284 int64_t NumBytes, NumVGScaledBytes; 4285 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets( 4286 OffsetFromDefCFA, NumBytes, NumVGScaledBytes); 4287 4288 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); 4289 4290 // Non-scalable offsets can use DW_CFA_offset directly. 4291 if (!NumVGScaledBytes) 4292 return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes); 4293 4294 std::string CommentBuffer; 4295 llvm::raw_string_ostream Comment(CommentBuffer); 4296 Comment << printReg(Reg, &TRI) << " @ cfa"; 4297 4298 // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG) 4299 SmallString<64> OffsetExpr; 4300 appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes, 4301 TRI.getDwarfRegNum(AArch64::VG, true), Comment); 4302 4303 // Wrap this into DW_CFA_expression 4304 SmallString<64> CfaExpr; 4305 CfaExpr.push_back(dwarf::DW_CFA_expression); 4306 uint8_t buffer[16]; 4307 CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer)); 4308 CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer)); 4309 CfaExpr.append(OffsetExpr.str()); 4310 4311 return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(), 4312 Comment.str()); 4313 } 4314 4315 // Helper function to emit a frame offset adjustment from a given 4316 // pointer (SrcReg), stored into DestReg. 
This function is explicit 4317 // in that it requires the opcode. 4318 static void emitFrameOffsetAdj(MachineBasicBlock &MBB, 4319 MachineBasicBlock::iterator MBBI, 4320 const DebugLoc &DL, unsigned DestReg, 4321 unsigned SrcReg, int64_t Offset, unsigned Opc, 4322 const TargetInstrInfo *TII, 4323 MachineInstr::MIFlag Flag, bool NeedsWinCFI, 4324 bool *HasWinCFI, bool EmitCFAOffset, 4325 StackOffset CFAOffset, unsigned FrameReg) { 4326 int Sign = 1; 4327 unsigned MaxEncoding, ShiftSize; 4328 switch (Opc) { 4329 case AArch64::ADDXri: 4330 case AArch64::ADDSXri: 4331 case AArch64::SUBXri: 4332 case AArch64::SUBSXri: 4333 MaxEncoding = 0xfff; 4334 ShiftSize = 12; 4335 break; 4336 case AArch64::ADDVL_XXI: 4337 case AArch64::ADDPL_XXI: 4338 case AArch64::ADDSVL_XXI: 4339 case AArch64::ADDSPL_XXI: 4340 MaxEncoding = 31; 4341 ShiftSize = 0; 4342 if (Offset < 0) { 4343 MaxEncoding = 32; 4344 Sign = -1; 4345 Offset = -Offset; 4346 } 4347 break; 4348 default: 4349 llvm_unreachable("Unsupported opcode"); 4350 } 4351 4352 // `Offset` can be in bytes or in "scalable bytes". 4353 int VScale = 1; 4354 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI) 4355 VScale = 16; 4356 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI) 4357 VScale = 2; 4358 4359 // FIXME: If the offset won't fit in 24-bits, compute the offset into a 4360 // scratch register. If DestReg is a virtual register, use it as the 4361 // scratch register; otherwise, create a new virtual register (to be 4362 // replaced by the scavenger at the end of PEI). That case can be optimized 4363 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch 4364 // register can be loaded with offset%8 and the add/sub can use an extending 4365 // instruction with LSL#3. 4366 // Currently the function handles any offsets but generates a poor sequence 4367 // of code. 4368 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); 4369 4370 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; 4371 Register TmpReg = DestReg; 4372 if (TmpReg == AArch64::XZR) 4373 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister( 4374 &AArch64::GPR64RegClass); 4375 do { 4376 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue); 4377 unsigned LocalShiftSize = 0; 4378 if (ThisVal > MaxEncoding) { 4379 ThisVal = ThisVal >> ShiftSize; 4380 LocalShiftSize = ShiftSize; 4381 } 4382 assert((ThisVal >> ShiftSize) <= MaxEncoding && 4383 "Encoding cannot handle value that big"); 4384 4385 Offset -= ThisVal << LocalShiftSize; 4386 if (Offset == 0) 4387 TmpReg = DestReg; 4388 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg) 4389 .addReg(SrcReg) 4390 .addImm(Sign * (int)ThisVal); 4391 if (ShiftSize) 4392 MBI = MBI.addImm( 4393 AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize)); 4394 MBI = MBI.setMIFlag(Flag); 4395 4396 auto Change = 4397 VScale == 1 4398 ? 
StackOffset::getFixed(ThisVal << LocalShiftSize) 4399 : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize)); 4400 if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri) 4401 CFAOffset += Change; 4402 else 4403 CFAOffset -= Change; 4404 if (EmitCFAOffset && DestReg == TmpReg) { 4405 MachineFunction &MF = *MBB.getParent(); 4406 const TargetSubtargetInfo &STI = MF.getSubtarget(); 4407 const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); 4408 4409 unsigned CFIIndex = MF.addFrameInst( 4410 createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1)); 4411 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) 4412 .addCFIIndex(CFIIndex) 4413 .setMIFlags(Flag); 4414 } 4415 4416 if (NeedsWinCFI) { 4417 assert(Sign == 1 && "SEH directives should always have a positive sign"); 4418 int Imm = (int)(ThisVal << LocalShiftSize); 4419 if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) || 4420 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) { 4421 if (HasWinCFI) 4422 *HasWinCFI = true; 4423 if (Imm == 0) 4424 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag); 4425 else 4426 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)) 4427 .addImm(Imm) 4428 .setMIFlag(Flag); 4429 assert(Offset == 0 && "Expected remaining offset to be zero to " 4430 "emit a single SEH directive"); 4431 } else if (DestReg == AArch64::SP) { 4432 if (HasWinCFI) 4433 *HasWinCFI = true; 4434 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc"); 4435 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) 4436 .addImm(Imm) 4437 .setMIFlag(Flag); 4438 } 4439 } 4440 4441 SrcReg = TmpReg; 4442 } while (Offset); 4443 } 4444 4445 void llvm::emitFrameOffset(MachineBasicBlock &MBB, 4446 MachineBasicBlock::iterator MBBI, const DebugLoc &DL, 4447 unsigned DestReg, unsigned SrcReg, 4448 StackOffset Offset, const TargetInstrInfo *TII, 4449 MachineInstr::MIFlag Flag, bool SetNZCV, 4450 bool NeedsWinCFI, bool *HasWinCFI, 4451 bool EmitCFAOffset, StackOffset CFAOffset, 4452 unsigned FrameReg) { 4453 // If a function is marked as arm_locally_streaming, then the runtime value of 4454 // vscale in the prologue/epilogue is different the runtime value of vscale 4455 // in the function's body. To avoid having to consider multiple vscales, 4456 // we can use `addsvl` to allocate any scalable stack-slots, which under 4457 // most circumstances will be only locals, not callee-save slots. 4458 const Function &F = MBB.getParent()->getFunction(); 4459 bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body"); 4460 4461 int64_t Bytes, NumPredicateVectors, NumDataVectors; 4462 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( 4463 Offset, Bytes, NumPredicateVectors, NumDataVectors); 4464 4465 // First emit non-scalable frame offsets, or a simple 'mov'. 4466 if (Bytes || (!Offset && SrcReg != DestReg)) { 4467 assert((DestReg != AArch64::SP || Bytes % 8 == 0) && 4468 "SP increment/decrement not 8-byte aligned"); 4469 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri; 4470 if (Bytes < 0) { 4471 Bytes = -Bytes; 4472 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri; 4473 } 4474 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag, 4475 NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset, 4476 FrameReg); 4477 CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri) 4478 ? 
StackOffset::getFixed(-Bytes)
                     : StackOffset::getFixed(Bytes);
    SrcReg = DestReg;
    FrameReg = DestReg;
  }

  assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
         "SetNZCV not supported with SVE vectors");
  assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
         "WinCFI not supported with SVE vectors");

  if (NumDataVectors) {
    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
                       UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI,
                       TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
                       CFAOffset, FrameReg);
    CFAOffset += StackOffset::getScalable(-NumDataVectors * 16);
    SrcReg = DestReg;
  }

  if (NumPredicateVectors) {
    assert(DestReg != AArch64::SP && "Unaligned access to SP");
    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
                       UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI,
                       TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
                       CFAOffset, FrameReg);
  }
}

MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
    MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
    MachineBasicBlock::iterator InsertPt, int FrameIndex,
    LiveIntervals *LIS, VirtRegMap *VRM) const {
  // This is a bit of a hack. Consider this instruction:
  //
  //   %0 = COPY %sp; GPR64all:%0
  //
  // We explicitly chose GPR64all for the virtual register so such a copy might
  // be eliminated by RegisterCoalescer. However, that may not be possible, and
  // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
  // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
  //
  // To prevent that, we are going to constrain the %0 register class here.
  //
  // <rdar://problem/11522048>
  //
  if (MI.isFullCopy()) {
    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();
    if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
      MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
      return nullptr;
    }
    if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
      MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
      return nullptr;
    }
    // Nothing can be folded with a copy from/to NZCV.
    if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
      return nullptr;
  }

  // Handle the case where a copy is being spilled or filled but the source
  // and destination register classes don't match. For example:
  //
  //   %0 = COPY %xzr; GPR64common:%0
  //
  // In this case we can still safely fold away the COPY and generate the
  // following spill code:
  //
  //   STRXui %xzr, %stack.0
  //
  // This also eliminates spilled cross register class COPYs (e.g. between x and
  // d regs) of the same size. For example:
  //
  //   %0 = COPY %1; GPR64:%0, FPR64:%1
  //
  // will be filled as
  //
  //   LDRDui %0, fi<#0>
  //
  // instead of
  //
  //   LDRXui %Temp, fi<#0>
  //   %0 = FMOV %Temp
  //
  if (MI.isCopy() && Ops.size() == 1 &&
      // Make sure we're only folding the explicit COPY defs/uses.
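      // (Operand 0 is the COPY's def and operand 1 its use, so Ops[0] == 0
      // corresponds to spilling the def and Ops[0] == 1 to filling the use.)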
4566 (Ops[0] == 0 || Ops[0] == 1)) { 4567 bool IsSpill = Ops[0] == 0; 4568 bool IsFill = !IsSpill; 4569 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); 4570 const MachineRegisterInfo &MRI = MF.getRegInfo(); 4571 MachineBasicBlock &MBB = *MI.getParent(); 4572 const MachineOperand &DstMO = MI.getOperand(0); 4573 const MachineOperand &SrcMO = MI.getOperand(1); 4574 Register DstReg = DstMO.getReg(); 4575 Register SrcReg = SrcMO.getReg(); 4576 // This is slightly expensive to compute for physical regs since 4577 // getMinimalPhysRegClass is slow. 4578 auto getRegClass = [&](unsigned Reg) { 4579 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg) 4580 : TRI.getMinimalPhysRegClass(Reg); 4581 }; 4582 4583 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { 4584 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) == 4585 TRI.getRegSizeInBits(*getRegClass(SrcReg)) && 4586 "Mismatched register size in non subreg COPY"); 4587 if (IsSpill) 4588 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, 4589 getRegClass(SrcReg), &TRI, Register()); 4590 else 4591 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, 4592 getRegClass(DstReg), &TRI, Register()); 4593 return &*--InsertPt; 4594 } 4595 4596 // Handle cases like spilling def of: 4597 // 4598 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0 4599 // 4600 // where the physical register source can be widened and stored to the full 4601 // virtual reg destination stack slot, in this case producing: 4602 // 4603 // STRXui %xzr, %stack.0 4604 // 4605 if (IsSpill && DstMO.isUndef() && SrcReg.isPhysical()) { 4606 assert(SrcMO.getSubReg() == 0 && 4607 "Unexpected subreg on physical register"); 4608 const TargetRegisterClass *SpillRC; 4609 unsigned SpillSubreg; 4610 switch (DstMO.getSubReg()) { 4611 default: 4612 SpillRC = nullptr; 4613 break; 4614 case AArch64::sub_32: 4615 case AArch64::ssub: 4616 if (AArch64::GPR32RegClass.contains(SrcReg)) { 4617 SpillRC = &AArch64::GPR64RegClass; 4618 SpillSubreg = AArch64::sub_32; 4619 } else if (AArch64::FPR32RegClass.contains(SrcReg)) { 4620 SpillRC = &AArch64::FPR64RegClass; 4621 SpillSubreg = AArch64::ssub; 4622 } else 4623 SpillRC = nullptr; 4624 break; 4625 case AArch64::dsub: 4626 if (AArch64::FPR64RegClass.contains(SrcReg)) { 4627 SpillRC = &AArch64::FPR128RegClass; 4628 SpillSubreg = AArch64::dsub; 4629 } else 4630 SpillRC = nullptr; 4631 break; 4632 } 4633 4634 if (SpillRC) 4635 if (unsigned WidenedSrcReg = 4636 TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) { 4637 storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(), 4638 FrameIndex, SpillRC, &TRI, Register()); 4639 return &*--InsertPt; 4640 } 4641 } 4642 4643 // Handle cases like filling use of: 4644 // 4645 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1 4646 // 4647 // where we can load the full virtual reg source stack slot, into the subreg 4648 // destination, in this case producing: 4649 // 4650 // LDRWui %0:sub_32<def,read-undef>, %stack.0 4651 // 4652 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) { 4653 const TargetRegisterClass *FillRC; 4654 switch (DstMO.getSubReg()) { 4655 default: 4656 FillRC = nullptr; 4657 break; 4658 case AArch64::sub_32: 4659 FillRC = &AArch64::GPR32RegClass; 4660 break; 4661 case AArch64::ssub: 4662 FillRC = &AArch64::FPR32RegClass; 4663 break; 4664 case AArch64::dsub: 4665 FillRC = &AArch64::FPR64RegClass; 4666 break; 4667 } 4668 4669 if (FillRC) { 4670 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) == 4671 
TRI.getRegSizeInBits(*FillRC) && 4672 "Mismatched regclass size on folded subreg COPY"); 4673 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI, 4674 Register()); 4675 MachineInstr &LoadMI = *--InsertPt; 4676 MachineOperand &LoadDst = LoadMI.getOperand(0); 4677 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load"); 4678 LoadDst.setSubReg(DstMO.getSubReg()); 4679 LoadDst.setIsUndef(); 4680 return &LoadMI; 4681 } 4682 } 4683 } 4684 4685 // Cannot fold. 4686 return nullptr; 4687 } 4688 4689 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, 4690 StackOffset &SOffset, 4691 bool *OutUseUnscaledOp, 4692 unsigned *OutUnscaledOp, 4693 int64_t *EmittableOffset) { 4694 // Set output values in case of early exit. 4695 if (EmittableOffset) 4696 *EmittableOffset = 0; 4697 if (OutUseUnscaledOp) 4698 *OutUseUnscaledOp = false; 4699 if (OutUnscaledOp) 4700 *OutUnscaledOp = 0; 4701 4702 // Exit early for structured vector spills/fills as they can't take an 4703 // immediate offset. 4704 switch (MI.getOpcode()) { 4705 default: 4706 break; 4707 case AArch64::LD1Twov2d: 4708 case AArch64::LD1Threev2d: 4709 case AArch64::LD1Fourv2d: 4710 case AArch64::LD1Twov1d: 4711 case AArch64::LD1Threev1d: 4712 case AArch64::LD1Fourv1d: 4713 case AArch64::ST1Twov2d: 4714 case AArch64::ST1Threev2d: 4715 case AArch64::ST1Fourv2d: 4716 case AArch64::ST1Twov1d: 4717 case AArch64::ST1Threev1d: 4718 case AArch64::ST1Fourv1d: 4719 case AArch64::ST1i8: 4720 case AArch64::ST1i16: 4721 case AArch64::ST1i32: 4722 case AArch64::ST1i64: 4723 case AArch64::IRG: 4724 case AArch64::IRGstack: 4725 case AArch64::STGloop: 4726 case AArch64::STZGloop: 4727 return AArch64FrameOffsetCannotUpdate; 4728 } 4729 4730 // Get the min/max offset and the scale. 4731 TypeSize ScaleValue(0U, false); 4732 unsigned Width; 4733 int64_t MinOff, MaxOff; 4734 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff, 4735 MaxOff)) 4736 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 4737 4738 // Construct the complete offset. 4739 bool IsMulVL = ScaleValue.isScalable(); 4740 unsigned Scale = ScaleValue.getKnownMinValue(); 4741 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed(); 4742 4743 const MachineOperand &ImmOpnd = 4744 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode())); 4745 Offset += ImmOpnd.getImm() * Scale; 4746 4747 // If the offset doesn't match the scale, we rewrite the instruction to 4748 // use the unscaled instruction instead. Likewise, if we have a negative 4749 // offset and there is an unscaled op to use. 4750 std::optional<unsigned> UnscaledOp = 4751 AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode()); 4752 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0); 4753 if (useUnscaledOp && 4754 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff, 4755 MaxOff)) 4756 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 4757 4758 Scale = ScaleValue.getKnownMinValue(); 4759 assert(IsMulVL == ScaleValue.isScalable() && 4760 "Unscaled opcode has different value for scalable"); 4761 4762 int64_t Remainder = Offset % Scale; 4763 assert(!(Remainder && useUnscaledOp) && 4764 "Cannot have remainder when using unscaled op"); 4765 4766 assert(MinOff < MaxOff && "Unexpected Min/Max offsets"); 4767 int64_t NewOffset = Offset / Scale; 4768 if (MinOff <= NewOffset && NewOffset <= MaxOff) 4769 Offset = Remainder; 4770 else { 4771 NewOffset = NewOffset < 0 ? 
MinOff : MaxOff; 4772 Offset = Offset - NewOffset * Scale + Remainder; 4773 } 4774 4775 if (EmittableOffset) 4776 *EmittableOffset = NewOffset; 4777 if (OutUseUnscaledOp) 4778 *OutUseUnscaledOp = useUnscaledOp; 4779 if (OutUnscaledOp && UnscaledOp) 4780 *OutUnscaledOp = *UnscaledOp; 4781 4782 if (IsMulVL) 4783 SOffset = StackOffset::get(SOffset.getFixed(), Offset); 4784 else 4785 SOffset = StackOffset::get(Offset, SOffset.getScalable()); 4786 return AArch64FrameOffsetCanUpdate | 4787 (SOffset ? 0 : AArch64FrameOffsetIsLegal); 4788 } 4789 4790 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, 4791 unsigned FrameReg, StackOffset &Offset, 4792 const AArch64InstrInfo *TII) { 4793 unsigned Opcode = MI.getOpcode(); 4794 unsigned ImmIdx = FrameRegIdx + 1; 4795 4796 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) { 4797 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm()); 4798 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), 4799 MI.getOperand(0).getReg(), FrameReg, Offset, TII, 4800 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri)); 4801 MI.eraseFromParent(); 4802 Offset = StackOffset(); 4803 return true; 4804 } 4805 4806 int64_t NewOffset; 4807 unsigned UnscaledOp; 4808 bool UseUnscaledOp; 4809 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, 4810 &UnscaledOp, &NewOffset); 4811 if (Status & AArch64FrameOffsetCanUpdate) { 4812 if (Status & AArch64FrameOffsetIsLegal) 4813 // Replace the FrameIndex with FrameReg. 4814 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); 4815 if (UseUnscaledOp) 4816 MI.setDesc(TII->get(UnscaledOp)); 4817 4818 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); 4819 return !Offset; 4820 } 4821 4822 return false; 4823 } 4824 4825 MCInst AArch64InstrInfo::getNop() const { 4826 return MCInstBuilder(AArch64::HINT).addImm(0); 4827 } 4828 4829 // AArch64 supports MachineCombiner. 4830 bool AArch64InstrInfo::useMachineCombiner() const { return true; } 4831 4832 // True when Opc sets flag 4833 static bool isCombineInstrSettingFlag(unsigned Opc) { 4834 switch (Opc) { 4835 case AArch64::ADDSWrr: 4836 case AArch64::ADDSWri: 4837 case AArch64::ADDSXrr: 4838 case AArch64::ADDSXri: 4839 case AArch64::SUBSWrr: 4840 case AArch64::SUBSXrr: 4841 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 4842 case AArch64::SUBSWri: 4843 case AArch64::SUBSXri: 4844 return true; 4845 default: 4846 break; 4847 } 4848 return false; 4849 } 4850 4851 // 32b Opcodes that can be combined with a MUL 4852 static bool isCombineInstrCandidate32(unsigned Opc) { 4853 switch (Opc) { 4854 case AArch64::ADDWrr: 4855 case AArch64::ADDWri: 4856 case AArch64::SUBWrr: 4857 case AArch64::ADDSWrr: 4858 case AArch64::ADDSWri: 4859 case AArch64::SUBSWrr: 4860 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 4861 case AArch64::SUBWri: 4862 case AArch64::SUBSWri: 4863 return true; 4864 default: 4865 break; 4866 } 4867 return false; 4868 } 4869 4870 // 64b Opcodes that can be combined with a MUL 4871 static bool isCombineInstrCandidate64(unsigned Opc) { 4872 switch (Opc) { 4873 case AArch64::ADDXrr: 4874 case AArch64::ADDXri: 4875 case AArch64::SUBXrr: 4876 case AArch64::ADDSXrr: 4877 case AArch64::ADDSXri: 4878 case AArch64::SUBSXrr: 4879 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 
  case AArch64::SUBXri:
  case AArch64::SUBSXri:
  case AArch64::ADDv8i8:
  case AArch64::ADDv16i8:
  case AArch64::ADDv4i16:
  case AArch64::ADDv8i16:
  case AArch64::ADDv2i32:
  case AArch64::ADDv4i32:
  case AArch64::SUBv8i8:
  case AArch64::SUBv16i8:
  case AArch64::SUBv4i16:
  case AArch64::SUBv8i16:
  case AArch64::SUBv2i32:
  case AArch64::SUBv4i32:
    return true;
  default:
    break;
  }
  return false;
}

// FP Opcodes that can be combined with an FMUL.
static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
  switch (Inst.getOpcode()) {
  default:
    break;
  case AArch64::FADDHrr:
  case AArch64::FADDSrr:
  case AArch64::FADDDrr:
  case AArch64::FADDv4f16:
  case AArch64::FADDv8f16:
  case AArch64::FADDv2f32:
  case AArch64::FADDv2f64:
  case AArch64::FADDv4f32:
  case AArch64::FSUBHrr:
  case AArch64::FSUBSrr:
  case AArch64::FSUBDrr:
  case AArch64::FSUBv4f16:
  case AArch64::FSUBv8f16:
  case AArch64::FSUBv2f32:
  case AArch64::FSUBv2f64:
  case AArch64::FSUBv4f32:
    TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
    // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
    // the target options or if FADD/FSUB has the contract fast-math flag.
    return Options.UnsafeFPMath ||
           Options.AllowFPOpFusion == FPOpFusion::Fast ||
           Inst.getFlag(MachineInstr::FmContract);
    return true;
  }
  return false;
}

// Opcodes that can be combined with a MUL
static bool isCombineInstrCandidate(unsigned Opc) {
  return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
}

//
// Utility routine that checks if \param MO is defined by an
// \param CombineOpc instruction in the basic block \param MBB
static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
                       unsigned CombineOpc, unsigned ZeroReg = 0,
                       bool CheckZeroReg = false) {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineInstr *MI = nullptr;

  if (MO.isReg() && MO.getReg().isVirtual())
    MI = MRI.getUniqueVRegDef(MO.getReg());
  // And it needs to be in the trace (otherwise, it won't have a depth).
  if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
    return false;
  // Must only be used by the user we combine with.
  if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
    return false;

  if (CheckZeroReg) {
    assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
           MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
           MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
    // The third input reg must be zero.
    if (MI->getOperand(3).getReg() != ZeroReg)
      return false;
  }

  if (isCombineInstrSettingFlag(CombineOpc) &&
      MI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
    return false;

  return true;
}

//
// Is \param MO defined by an integer multiply, and can it be combined?
static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
                              unsigned MulOpc, unsigned ZeroReg) {
  return canCombine(MBB, MO, MulOpc, ZeroReg, true);
}

//
// Is \param MO defined by a floating-point multiply, and can it be combined?
4981 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, 4982 unsigned MulOpc) { 4983 return canCombine(MBB, MO, MulOpc); 4984 } 4985 4986 // TODO: There are many more machine instruction opcodes to match: 4987 // 1. Other data types (integer, vectors) 4988 // 2. Other math / logic operations (xor, or) 4989 // 3. Other forms of the same operation (intrinsics and other variants) 4990 bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst, 4991 bool Invert) const { 4992 if (Invert) 4993 return false; 4994 switch (Inst.getOpcode()) { 4995 // == Floating-point types == 4996 // -- Floating-point instructions -- 4997 case AArch64::FADDHrr: 4998 case AArch64::FADDSrr: 4999 case AArch64::FADDDrr: 5000 case AArch64::FMULHrr: 5001 case AArch64::FMULSrr: 5002 case AArch64::FMULDrr: 5003 case AArch64::FMULX16: 5004 case AArch64::FMULX32: 5005 case AArch64::FMULX64: 5006 // -- Advanced SIMD instructions -- 5007 case AArch64::FADDv4f16: 5008 case AArch64::FADDv8f16: 5009 case AArch64::FADDv2f32: 5010 case AArch64::FADDv4f32: 5011 case AArch64::FADDv2f64: 5012 case AArch64::FMULv4f16: 5013 case AArch64::FMULv8f16: 5014 case AArch64::FMULv2f32: 5015 case AArch64::FMULv4f32: 5016 case AArch64::FMULv2f64: 5017 case AArch64::FMULXv4f16: 5018 case AArch64::FMULXv8f16: 5019 case AArch64::FMULXv2f32: 5020 case AArch64::FMULXv4f32: 5021 case AArch64::FMULXv2f64: 5022 // -- SVE instructions -- 5023 // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX 5024 // in the SVE instruction set (though there are predicated ones). 5025 case AArch64::FADD_ZZZ_H: 5026 case AArch64::FADD_ZZZ_S: 5027 case AArch64::FADD_ZZZ_D: 5028 case AArch64::FMUL_ZZZ_H: 5029 case AArch64::FMUL_ZZZ_S: 5030 case AArch64::FMUL_ZZZ_D: 5031 return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath || 5032 (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && 5033 Inst.getFlag(MachineInstr::MIFlag::FmNsz)); 5034 5035 // == Integer types == 5036 // -- Base instructions -- 5037 // Opcodes MULWrr and MULXrr don't exist because 5038 // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of 5039 // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively. 5040 // The machine-combiner does not support three-source-operands machine 5041 // instruction. So we cannot reassociate MULs. 5042 case AArch64::ADDWrr: 5043 case AArch64::ADDXrr: 5044 case AArch64::ANDWrr: 5045 case AArch64::ANDXrr: 5046 case AArch64::ORRWrr: 5047 case AArch64::ORRXrr: 5048 case AArch64::EORWrr: 5049 case AArch64::EORXrr: 5050 case AArch64::EONWrr: 5051 case AArch64::EONXrr: 5052 // -- Advanced SIMD instructions -- 5053 // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL 5054 // in the Advanced SIMD instruction set. 
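  // (The vector ADD opcodes listed below do include the 64-bit element forms
  // ADDv1i64 and ADDv2i64; only the MUL list is restricted to i8/i16/i32
  // elements.)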
5055 case AArch64::ADDv8i8: 5056 case AArch64::ADDv16i8: 5057 case AArch64::ADDv4i16: 5058 case AArch64::ADDv8i16: 5059 case AArch64::ADDv2i32: 5060 case AArch64::ADDv4i32: 5061 case AArch64::ADDv1i64: 5062 case AArch64::ADDv2i64: 5063 case AArch64::MULv8i8: 5064 case AArch64::MULv16i8: 5065 case AArch64::MULv4i16: 5066 case AArch64::MULv8i16: 5067 case AArch64::MULv2i32: 5068 case AArch64::MULv4i32: 5069 case AArch64::ANDv8i8: 5070 case AArch64::ANDv16i8: 5071 case AArch64::ORRv8i8: 5072 case AArch64::ORRv16i8: 5073 case AArch64::EORv8i8: 5074 case AArch64::EORv16i8: 5075 // -- SVE instructions -- 5076 case AArch64::ADD_ZZZ_B: 5077 case AArch64::ADD_ZZZ_H: 5078 case AArch64::ADD_ZZZ_S: 5079 case AArch64::ADD_ZZZ_D: 5080 case AArch64::MUL_ZZZ_B: 5081 case AArch64::MUL_ZZZ_H: 5082 case AArch64::MUL_ZZZ_S: 5083 case AArch64::MUL_ZZZ_D: 5084 case AArch64::AND_ZZZ: 5085 case AArch64::ORR_ZZZ: 5086 case AArch64::EOR_ZZZ: 5087 return true; 5088 5089 default: 5090 return false; 5091 } 5092 } 5093 5094 /// Find instructions that can be turned into madd. 5095 static bool getMaddPatterns(MachineInstr &Root, 5096 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 5097 unsigned Opc = Root.getOpcode(); 5098 MachineBasicBlock &MBB = *Root.getParent(); 5099 bool Found = false; 5100 5101 if (!isCombineInstrCandidate(Opc)) 5102 return false; 5103 if (isCombineInstrSettingFlag(Opc)) { 5104 int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true); 5105 // When NZCV is live bail out. 5106 if (Cmp_NZCV == -1) 5107 return false; 5108 unsigned NewOpc = convertToNonFlagSettingOpc(Root); 5109 // When opcode can't change bail out. 5110 // CHECKME: do we miss any cases for opcode conversion? 5111 if (NewOpc == Opc) 5112 return false; 5113 Opc = NewOpc; 5114 } 5115 5116 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg, 5117 MachineCombinerPattern Pattern) { 5118 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) { 5119 Patterns.push_back(Pattern); 5120 Found = true; 5121 } 5122 }; 5123 5124 auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) { 5125 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) { 5126 Patterns.push_back(Pattern); 5127 Found = true; 5128 } 5129 }; 5130 5131 typedef MachineCombinerPattern MCP; 5132 5133 switch (Opc) { 5134 default: 5135 break; 5136 case AArch64::ADDWrr: 5137 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 5138 "ADDWrr does not have register operands"); 5139 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1); 5140 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2); 5141 break; 5142 case AArch64::ADDXrr: 5143 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1); 5144 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2); 5145 break; 5146 case AArch64::SUBWrr: 5147 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1); 5148 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2); 5149 break; 5150 case AArch64::SUBXrr: 5151 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1); 5152 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2); 5153 break; 5154 case AArch64::ADDWri: 5155 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1); 5156 break; 5157 case AArch64::ADDXri: 5158 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1); 5159 break; 5160 case AArch64::SUBWri: 5161 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1); 5162 break; 5163 case AArch64::SUBXri: 5164 
setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1); 5165 break; 5166 case AArch64::ADDv8i8: 5167 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1); 5168 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2); 5169 break; 5170 case AArch64::ADDv16i8: 5171 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1); 5172 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2); 5173 break; 5174 case AArch64::ADDv4i16: 5175 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1); 5176 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2); 5177 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1); 5178 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2); 5179 break; 5180 case AArch64::ADDv8i16: 5181 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1); 5182 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2); 5183 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1); 5184 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2); 5185 break; 5186 case AArch64::ADDv2i32: 5187 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1); 5188 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2); 5189 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1); 5190 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2); 5191 break; 5192 case AArch64::ADDv4i32: 5193 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1); 5194 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2); 5195 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1); 5196 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2); 5197 break; 5198 case AArch64::SUBv8i8: 5199 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1); 5200 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2); 5201 break; 5202 case AArch64::SUBv16i8: 5203 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1); 5204 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2); 5205 break; 5206 case AArch64::SUBv4i16: 5207 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1); 5208 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2); 5209 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1); 5210 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2); 5211 break; 5212 case AArch64::SUBv8i16: 5213 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1); 5214 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2); 5215 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1); 5216 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2); 5217 break; 5218 case AArch64::SUBv2i32: 5219 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1); 5220 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2); 5221 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1); 5222 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2); 5223 break; 5224 case AArch64::SUBv4i32: 5225 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1); 5226 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2); 5227 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1); 5228 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2); 5229 break; 5230 } 5231 return Found; 5232 } 5233 /// Floating-Point Support 5234 5235 /// Find instructions that can be turned into madd. 
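/// Here "madd" means the fused FP forms (FMADD/FMSUB and the FMLA/FMLS vector
/// variants). An illustrative MIR sketch (virtual register names are made up):
///   %3:fpr32 = FMULSrr %1, %2
///   %4:fpr32 = FADDSrr killed %3, %0
/// is recorded as FMULADDS_OP1 so the combiner can later rewrite it to
///   %4:fpr32 = FMADDSrrr %1, %2, %0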
5236 static bool getFMAPatterns(MachineInstr &Root, 5237 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 5238 5239 if (!isCombineInstrCandidateFP(Root)) 5240 return false; 5241 5242 MachineBasicBlock &MBB = *Root.getParent(); 5243 bool Found = false; 5244 5245 auto Match = [&](int Opcode, int Operand, 5246 MachineCombinerPattern Pattern) -> bool { 5247 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) { 5248 Patterns.push_back(Pattern); 5249 return true; 5250 } 5251 return false; 5252 }; 5253 5254 typedef MachineCombinerPattern MCP; 5255 5256 switch (Root.getOpcode()) { 5257 default: 5258 assert(false && "Unsupported FP instruction in combiner\n"); 5259 break; 5260 case AArch64::FADDHrr: 5261 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 5262 "FADDHrr does not have register operands"); 5263 5264 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1); 5265 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2); 5266 break; 5267 case AArch64::FADDSrr: 5268 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 5269 "FADDSrr does not have register operands"); 5270 5271 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) || 5272 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1); 5273 5274 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) || 5275 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2); 5276 break; 5277 case AArch64::FADDDrr: 5278 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) || 5279 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1); 5280 5281 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) || 5282 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2); 5283 break; 5284 case AArch64::FADDv4f16: 5285 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) || 5286 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1); 5287 5288 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) || 5289 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2); 5290 break; 5291 case AArch64::FADDv8f16: 5292 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) || 5293 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1); 5294 5295 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) || 5296 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2); 5297 break; 5298 case AArch64::FADDv2f32: 5299 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) || 5300 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1); 5301 5302 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) || 5303 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2); 5304 break; 5305 case AArch64::FADDv2f64: 5306 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) || 5307 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1); 5308 5309 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) || 5310 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2); 5311 break; 5312 case AArch64::FADDv4f32: 5313 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) || 5314 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1); 5315 5316 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) || 5317 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2); 5318 break; 5319 case AArch64::FSUBHrr: 5320 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1); 5321 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2); 5322 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1); 5323 break; 
5324 case AArch64::FSUBSrr: 5325 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1); 5326 5327 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) || 5328 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2); 5329 5330 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1); 5331 break; 5332 case AArch64::FSUBDrr: 5333 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1); 5334 5335 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) || 5336 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2); 5337 5338 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1); 5339 break; 5340 case AArch64::FSUBv4f16: 5341 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) || 5342 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2); 5343 5344 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) || 5345 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1); 5346 break; 5347 case AArch64::FSUBv8f16: 5348 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) || 5349 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2); 5350 5351 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) || 5352 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1); 5353 break; 5354 case AArch64::FSUBv2f32: 5355 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) || 5356 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2); 5357 5358 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) || 5359 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1); 5360 break; 5361 case AArch64::FSUBv2f64: 5362 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) || 5363 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2); 5364 5365 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) || 5366 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1); 5367 break; 5368 case AArch64::FSUBv4f32: 5369 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) || 5370 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2); 5371 5372 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) || 5373 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1); 5374 break; 5375 } 5376 return Found; 5377 } 5378 5379 static bool getFMULPatterns(MachineInstr &Root, 5380 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 5381 MachineBasicBlock &MBB = *Root.getParent(); 5382 bool Found = false; 5383 5384 auto Match = [&](unsigned Opcode, int Operand, 5385 MachineCombinerPattern Pattern) -> bool { 5386 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5387 MachineOperand &MO = Root.getOperand(Operand); 5388 MachineInstr *MI = nullptr; 5389 if (MO.isReg() && MO.getReg().isVirtual()) 5390 MI = MRI.getUniqueVRegDef(MO.getReg()); 5391 // Ignore No-op COPYs in FMUL(COPY(DUP(..))) 5392 if (MI && MI->getOpcode() == TargetOpcode::COPY && 5393 MI->getOperand(1).getReg().isVirtual()) 5394 MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg()); 5395 if (MI && MI->getOpcode() == Opcode) { 5396 Patterns.push_back(Pattern); 5397 return true; 5398 } 5399 return false; 5400 }; 5401 5402 typedef MachineCombinerPattern MCP; 5403 5404 switch (Root.getOpcode()) { 5405 default: 5406 return false; 5407 case AArch64::FMULv2f32: 5408 Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1); 5409 Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2); 5410 break; 5411 case AArch64::FMULv2f64: 5412 Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1); 5413 Found |= 
Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2); 5414 break; 5415 case AArch64::FMULv4f16: 5416 Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1); 5417 Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2); 5418 break; 5419 case AArch64::FMULv4f32: 5420 Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1); 5421 Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2); 5422 break; 5423 case AArch64::FMULv8f16: 5424 Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1); 5425 Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2); 5426 break; 5427 } 5428 5429 return Found; 5430 } 5431 5432 static bool getFNEGPatterns(MachineInstr &Root, 5433 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 5434 unsigned Opc = Root.getOpcode(); 5435 MachineBasicBlock &MBB = *Root.getParent(); 5436 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5437 5438 auto Match = [&](unsigned Opcode, MachineCombinerPattern Pattern) -> bool { 5439 MachineOperand &MO = Root.getOperand(1); 5440 MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg()); 5441 if (MI != nullptr && MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) && 5442 (MI->getOpcode() == Opcode) && 5443 Root.getFlag(MachineInstr::MIFlag::FmContract) && 5444 Root.getFlag(MachineInstr::MIFlag::FmNsz) && 5445 MI->getFlag(MachineInstr::MIFlag::FmContract) && 5446 MI->getFlag(MachineInstr::MIFlag::FmNsz)) { 5447 Patterns.push_back(Pattern); 5448 return true; 5449 } 5450 return false; 5451 }; 5452 5453 switch (Opc) { 5454 default: 5455 break; 5456 case AArch64::FNEGDr: 5457 return Match(AArch64::FMADDDrrr, MachineCombinerPattern::FNMADD); 5458 case AArch64::FNEGSr: 5459 return Match(AArch64::FMADDSrrr, MachineCombinerPattern::FNMADD); 5460 } 5461 5462 return false; 5463 } 5464 5465 /// Return true when a code sequence can improve throughput. It 5466 /// should be called only for instructions in loops. 
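/// For example, rewriting FMUL + FADD into a single FMLA may not shorten the
/// critical path, but it issues one instruction instead of two per loop
/// iteration, which is generally a throughput win (the exact benefit is
/// microarchitecture dependent).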
5467 /// \param Pattern - combiner pattern 5468 bool AArch64InstrInfo::isThroughputPattern( 5469 MachineCombinerPattern Pattern) const { 5470 switch (Pattern) { 5471 default: 5472 break; 5473 case MachineCombinerPattern::FMULADDH_OP1: 5474 case MachineCombinerPattern::FMULADDH_OP2: 5475 case MachineCombinerPattern::FMULSUBH_OP1: 5476 case MachineCombinerPattern::FMULSUBH_OP2: 5477 case MachineCombinerPattern::FMULADDS_OP1: 5478 case MachineCombinerPattern::FMULADDS_OP2: 5479 case MachineCombinerPattern::FMULSUBS_OP1: 5480 case MachineCombinerPattern::FMULSUBS_OP2: 5481 case MachineCombinerPattern::FMULADDD_OP1: 5482 case MachineCombinerPattern::FMULADDD_OP2: 5483 case MachineCombinerPattern::FMULSUBD_OP1: 5484 case MachineCombinerPattern::FMULSUBD_OP2: 5485 case MachineCombinerPattern::FNMULSUBH_OP1: 5486 case MachineCombinerPattern::FNMULSUBS_OP1: 5487 case MachineCombinerPattern::FNMULSUBD_OP1: 5488 case MachineCombinerPattern::FMLAv4i16_indexed_OP1: 5489 case MachineCombinerPattern::FMLAv4i16_indexed_OP2: 5490 case MachineCombinerPattern::FMLAv8i16_indexed_OP1: 5491 case MachineCombinerPattern::FMLAv8i16_indexed_OP2: 5492 case MachineCombinerPattern::FMLAv1i32_indexed_OP1: 5493 case MachineCombinerPattern::FMLAv1i32_indexed_OP2: 5494 case MachineCombinerPattern::FMLAv1i64_indexed_OP1: 5495 case MachineCombinerPattern::FMLAv1i64_indexed_OP2: 5496 case MachineCombinerPattern::FMLAv4f16_OP2: 5497 case MachineCombinerPattern::FMLAv4f16_OP1: 5498 case MachineCombinerPattern::FMLAv8f16_OP1: 5499 case MachineCombinerPattern::FMLAv8f16_OP2: 5500 case MachineCombinerPattern::FMLAv2f32_OP2: 5501 case MachineCombinerPattern::FMLAv2f32_OP1: 5502 case MachineCombinerPattern::FMLAv2f64_OP1: 5503 case MachineCombinerPattern::FMLAv2f64_OP2: 5504 case MachineCombinerPattern::FMLAv2i32_indexed_OP1: 5505 case MachineCombinerPattern::FMLAv2i32_indexed_OP2: 5506 case MachineCombinerPattern::FMLAv2i64_indexed_OP1: 5507 case MachineCombinerPattern::FMLAv2i64_indexed_OP2: 5508 case MachineCombinerPattern::FMLAv4f32_OP1: 5509 case MachineCombinerPattern::FMLAv4f32_OP2: 5510 case MachineCombinerPattern::FMLAv4i32_indexed_OP1: 5511 case MachineCombinerPattern::FMLAv4i32_indexed_OP2: 5512 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: 5513 case MachineCombinerPattern::FMLSv4i16_indexed_OP2: 5514 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: 5515 case MachineCombinerPattern::FMLSv8i16_indexed_OP2: 5516 case MachineCombinerPattern::FMLSv1i32_indexed_OP2: 5517 case MachineCombinerPattern::FMLSv1i64_indexed_OP2: 5518 case MachineCombinerPattern::FMLSv2i32_indexed_OP2: 5519 case MachineCombinerPattern::FMLSv2i64_indexed_OP2: 5520 case MachineCombinerPattern::FMLSv4f16_OP1: 5521 case MachineCombinerPattern::FMLSv4f16_OP2: 5522 case MachineCombinerPattern::FMLSv8f16_OP1: 5523 case MachineCombinerPattern::FMLSv8f16_OP2: 5524 case MachineCombinerPattern::FMLSv2f32_OP2: 5525 case MachineCombinerPattern::FMLSv2f64_OP2: 5526 case MachineCombinerPattern::FMLSv4i32_indexed_OP2: 5527 case MachineCombinerPattern::FMLSv4f32_OP2: 5528 case MachineCombinerPattern::FMULv2i32_indexed_OP1: 5529 case MachineCombinerPattern::FMULv2i32_indexed_OP2: 5530 case MachineCombinerPattern::FMULv2i64_indexed_OP1: 5531 case MachineCombinerPattern::FMULv2i64_indexed_OP2: 5532 case MachineCombinerPattern::FMULv4i16_indexed_OP1: 5533 case MachineCombinerPattern::FMULv4i16_indexed_OP2: 5534 case MachineCombinerPattern::FMULv4i32_indexed_OP1: 5535 case MachineCombinerPattern::FMULv4i32_indexed_OP2: 5536 case 
MachineCombinerPattern::FMULv8i16_indexed_OP1: 5537 case MachineCombinerPattern::FMULv8i16_indexed_OP2: 5538 case MachineCombinerPattern::MULADDv8i8_OP1: 5539 case MachineCombinerPattern::MULADDv8i8_OP2: 5540 case MachineCombinerPattern::MULADDv16i8_OP1: 5541 case MachineCombinerPattern::MULADDv16i8_OP2: 5542 case MachineCombinerPattern::MULADDv4i16_OP1: 5543 case MachineCombinerPattern::MULADDv4i16_OP2: 5544 case MachineCombinerPattern::MULADDv8i16_OP1: 5545 case MachineCombinerPattern::MULADDv8i16_OP2: 5546 case MachineCombinerPattern::MULADDv2i32_OP1: 5547 case MachineCombinerPattern::MULADDv2i32_OP2: 5548 case MachineCombinerPattern::MULADDv4i32_OP1: 5549 case MachineCombinerPattern::MULADDv4i32_OP2: 5550 case MachineCombinerPattern::MULSUBv8i8_OP1: 5551 case MachineCombinerPattern::MULSUBv8i8_OP2: 5552 case MachineCombinerPattern::MULSUBv16i8_OP1: 5553 case MachineCombinerPattern::MULSUBv16i8_OP2: 5554 case MachineCombinerPattern::MULSUBv4i16_OP1: 5555 case MachineCombinerPattern::MULSUBv4i16_OP2: 5556 case MachineCombinerPattern::MULSUBv8i16_OP1: 5557 case MachineCombinerPattern::MULSUBv8i16_OP2: 5558 case MachineCombinerPattern::MULSUBv2i32_OP1: 5559 case MachineCombinerPattern::MULSUBv2i32_OP2: 5560 case MachineCombinerPattern::MULSUBv4i32_OP1: 5561 case MachineCombinerPattern::MULSUBv4i32_OP2: 5562 case MachineCombinerPattern::MULADDv4i16_indexed_OP1: 5563 case MachineCombinerPattern::MULADDv4i16_indexed_OP2: 5564 case MachineCombinerPattern::MULADDv8i16_indexed_OP1: 5565 case MachineCombinerPattern::MULADDv8i16_indexed_OP2: 5566 case MachineCombinerPattern::MULADDv2i32_indexed_OP1: 5567 case MachineCombinerPattern::MULADDv2i32_indexed_OP2: 5568 case MachineCombinerPattern::MULADDv4i32_indexed_OP1: 5569 case MachineCombinerPattern::MULADDv4i32_indexed_OP2: 5570 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 5571 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 5572 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 5573 case MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 5574 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 5575 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 5576 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 5577 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 5578 return true; 5579 } // end switch (Pattern) 5580 return false; 5581 } 5582 5583 /// Find other MI combine patterns. 5584 static bool getMiscPatterns(MachineInstr &Root, 5585 SmallVectorImpl<MachineCombinerPattern> &Patterns) 5586 { 5587 // A - (B + C) ==> (A - B) - C or (A - C) - B 5588 unsigned Opc = Root.getOpcode(); 5589 MachineBasicBlock &MBB = *Root.getParent(); 5590 5591 switch (Opc) { 5592 case AArch64::SUBWrr: 5593 case AArch64::SUBSWrr: 5594 case AArch64::SUBXrr: 5595 case AArch64::SUBSXrr: 5596 // Found candidate root. 5597 break; 5598 default: 5599 return false; 5600 } 5601 5602 if (isCombineInstrSettingFlag(Opc) && 5603 Root.findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 5604 return false; 5605 5606 if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) || 5607 canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) || 5608 canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) || 5609 canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) { 5610 Patterns.push_back(MachineCombinerPattern::SUBADD_OP1); 5611 Patterns.push_back(MachineCombinerPattern::SUBADD_OP2); 5612 return true; 5613 } 5614 5615 return false; 5616 } 5617 5618 /// Return true when there is potentially a faster code sequence for an 5619 /// instruction chain ending in \p Root. 
All potential patterns are listed in
5620 /// the \p Pattern vector. Pattern should be sorted in priority order since the
5621 /// pattern evaluator stops checking as soon as it finds a faster sequence.
5622
5623 bool AArch64InstrInfo::getMachineCombinerPatterns(
5624 MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
5625 bool DoRegPressureReduce) const {
5626 // Integer patterns
5627 if (getMaddPatterns(Root, Patterns))
5628 return true;
5629 // Floating point patterns
5630 if (getFMULPatterns(Root, Patterns))
5631 return true;
5632 if (getFMAPatterns(Root, Patterns))
5633 return true;
5634 if (getFNEGPatterns(Root, Patterns))
5635 return true;
5636
5637 // Other patterns
5638 if (getMiscPatterns(Root, Patterns))
5639 return true;
5640
5641 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
5642 DoRegPressureReduce);
5643 }
5644
5645 enum class FMAInstKind { Default, Indexed, Accumulator };
5646 /// genFusedMultiply - Generate fused multiply instructions.
5647 /// This function supports both integer and floating point instructions.
5648 /// A typical example:
5649 /// F|MUL I=A,B,0
5650 /// F|ADD R,I,C
5651 /// ==> F|MADD R,A,B,C
5652 /// \param MF Containing MachineFunction
5653 /// \param MRI Register information
5654 /// \param TII Target information
5655 /// \param Root is the F|ADD instruction
5656 /// \param [out] InsInstrs is a vector of machine instructions and will
5657 /// contain the generated madd instruction
5658 /// \param IdxMulOpd is index of operand in Root that is the result of
5659 /// the F|MUL. In the example above IdxMulOpd is 1.
5660 /// \param MaddOpc the opcode of the f|madd instruction
5661 /// \param RC Register class of operands
5662 /// \param kind of fma instruction (addressing mode) to be generated
5663 /// \param ReplacedAddend is the result register from the instruction
5664 /// replacing the non-combined operand, if any.
5665 static MachineInstr *
5666 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
5667 const TargetInstrInfo *TII, MachineInstr &Root,
5668 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
5669 unsigned MaddOpc, const TargetRegisterClass *RC,
5670 FMAInstKind kind = FMAInstKind::Default,
5671 const Register *ReplacedAddend = nullptr) {
5672 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
5673
5674 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
5675 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
5676 Register ResultReg = Root.getOperand(0).getReg();
5677 Register SrcReg0 = MUL->getOperand(1).getReg();
5678 bool Src0IsKill = MUL->getOperand(1).isKill();
5679 Register SrcReg1 = MUL->getOperand(2).getReg();
5680 bool Src1IsKill = MUL->getOperand(2).isKill();
5681
5682 Register SrcReg2;
5683 bool Src2IsKill;
5684 if (ReplacedAddend) {
5685 // If we just generated a new addend, we must be its only use.
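// (ReplacedAddend is the register created by genNeg() in the *AccNeg/*IdxNeg
// helpers, so marking it killed here is safe.)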
5686 SrcReg2 = *ReplacedAddend;
5687 Src2IsKill = true;
5688 } else {
5689 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
5690 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
5691 }
5692
5693 if (ResultReg.isVirtual())
5694 MRI.constrainRegClass(ResultReg, RC);
5695 if (SrcReg0.isVirtual())
5696 MRI.constrainRegClass(SrcReg0, RC);
5697 if (SrcReg1.isVirtual())
5698 MRI.constrainRegClass(SrcReg1, RC);
5699 if (SrcReg2.isVirtual())
5700 MRI.constrainRegClass(SrcReg2, RC);
5701
5702 MachineInstrBuilder MIB;
5703 if (kind == FMAInstKind::Default)
5704 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
5705 .addReg(SrcReg0, getKillRegState(Src0IsKill))
5706 .addReg(SrcReg1, getKillRegState(Src1IsKill))
5707 .addReg(SrcReg2, getKillRegState(Src2IsKill));
5708 else if (kind == FMAInstKind::Indexed)
5709 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
5710 .addReg(SrcReg2, getKillRegState(Src2IsKill))
5711 .addReg(SrcReg0, getKillRegState(Src0IsKill))
5712 .addReg(SrcReg1, getKillRegState(Src1IsKill))
5713 .addImm(MUL->getOperand(3).getImm());
5714 else if (kind == FMAInstKind::Accumulator)
5715 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg)
5716 .addReg(SrcReg2, getKillRegState(Src2IsKill))
5717 .addReg(SrcReg0, getKillRegState(Src0IsKill))
5718 .addReg(SrcReg1, getKillRegState(Src1IsKill));
5719 else
5720 assert(false && "Invalid FMA instruction kind \n");
5721 // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
5722 InsInstrs.push_back(MIB);
5723 return MUL;
5724 }
5725
5726 static MachineInstr *
5727 genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI,
5728 const TargetInstrInfo *TII, MachineInstr &Root,
5729 SmallVectorImpl<MachineInstr *> &InsInstrs) {
5730 MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
5731
5732 unsigned Opc = 0;
5733 const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg());
5734 if (AArch64::FPR32RegClass.hasSubClassEq(RC))
5735 Opc = AArch64::FNMADDSrrr;
5736 else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
5737 Opc = AArch64::FNMADDDrrr;
5738 else
5739 return nullptr;
5740
5741 Register ResultReg = Root.getOperand(0).getReg();
5742 Register SrcReg0 = MAD->getOperand(1).getReg();
5743 Register SrcReg1 = MAD->getOperand(2).getReg();
5744 Register SrcReg2 = MAD->getOperand(3).getReg();
5745 bool Src0IsKill = MAD->getOperand(1).isKill();
5746 bool Src1IsKill = MAD->getOperand(2).isKill();
5747 bool Src2IsKill = MAD->getOperand(3).isKill();
5748 if (ResultReg.isVirtual())
5749 MRI.constrainRegClass(ResultReg, RC);
5750 if (SrcReg0.isVirtual())
5751 MRI.constrainRegClass(SrcReg0, RC);
5752 if (SrcReg1.isVirtual())
5753 MRI.constrainRegClass(SrcReg1, RC);
5754 if (SrcReg2.isVirtual())
5755 MRI.constrainRegClass(SrcReg2, RC);
5756
5757 MachineInstrBuilder MIB =
5758 BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg)
5759 .addReg(SrcReg0, getKillRegState(Src0IsKill))
5760 .addReg(SrcReg1, getKillRegState(Src1IsKill))
5761 .addReg(SrcReg2, getKillRegState(Src2IsKill));
5762 InsInstrs.push_back(MIB);
5763
5764 return MAD;
5765 }
5766
5767 /// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
5768 static MachineInstr *
5769 genIndexedMultiply(MachineInstr &Root,
5770 SmallVectorImpl<MachineInstr *> &InsInstrs,
5771 unsigned IdxDupOp, unsigned MulOpc,
5772 const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
5773 assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
5774 "Invalid index of FMUL operand");
5775
5776 MachineFunction &MF = *Root.getMF();
5777 const
TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); 5778 5779 MachineInstr *Dup = 5780 MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg()); 5781 5782 if (Dup->getOpcode() == TargetOpcode::COPY) 5783 Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg()); 5784 5785 Register DupSrcReg = Dup->getOperand(1).getReg(); 5786 MRI.clearKillFlags(DupSrcReg); 5787 MRI.constrainRegClass(DupSrcReg, RC); 5788 5789 unsigned DupSrcLane = Dup->getOperand(2).getImm(); 5790 5791 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1; 5792 MachineOperand &MulOp = Root.getOperand(IdxMulOp); 5793 5794 Register ResultReg = Root.getOperand(0).getReg(); 5795 5796 MachineInstrBuilder MIB; 5797 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg) 5798 .add(MulOp) 5799 .addReg(DupSrcReg) 5800 .addImm(DupSrcLane); 5801 5802 InsInstrs.push_back(MIB); 5803 return &Root; 5804 } 5805 5806 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate 5807 /// instructions. 5808 /// 5809 /// \see genFusedMultiply 5810 static MachineInstr *genFusedMultiplyAcc( 5811 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 5812 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 5813 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) { 5814 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 5815 FMAInstKind::Accumulator); 5816 } 5817 5818 /// genNeg - Helper to generate an intermediate negation of the second operand 5819 /// of Root 5820 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI, 5821 const TargetInstrInfo *TII, MachineInstr &Root, 5822 SmallVectorImpl<MachineInstr *> &InsInstrs, 5823 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, 5824 unsigned MnegOpc, const TargetRegisterClass *RC) { 5825 Register NewVR = MRI.createVirtualRegister(RC); 5826 MachineInstrBuilder MIB = 5827 BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR) 5828 .add(Root.getOperand(2)); 5829 InsInstrs.push_back(MIB); 5830 5831 assert(InstrIdxForVirtReg.empty()); 5832 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5833 5834 return NewVR; 5835 } 5836 5837 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate 5838 /// instructions with an additional negation of the accumulator 5839 static MachineInstr *genFusedMultiplyAccNeg( 5840 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 5841 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 5842 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd, 5843 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) { 5844 assert(IdxMulOpd == 1); 5845 5846 Register NewVR = 5847 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC); 5848 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 5849 FMAInstKind::Accumulator, &NewVR); 5850 } 5851 5852 /// genFusedMultiplyIdx - Helper to generate fused multiply accumulate 5853 /// instructions. 
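/// For instance, a MULv4i16_indexed whose result feeds an ADDv4i16 can be
/// rewritten as a single MLAv4i16_indexed that accumulates into the ADD's
/// other operand (see MULADDv4i16_indexed_OP1/_OP2 above).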
5854 ///
5855 /// \see genFusedMultiply
5856 static MachineInstr *genFusedMultiplyIdx(
5857 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
5858 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
5859 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
5860 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
5861 FMAInstKind::Indexed);
5862 }
5863
5864 /// genFusedMultiplyIdxNeg - Helper to generate indexed fused multiply accumulate
5865 /// instructions with an additional negation of the accumulator
5866 static MachineInstr *genFusedMultiplyIdxNeg(
5867 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
5868 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
5869 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
5870 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
5871 assert(IdxMulOpd == 1);
5872
5873 Register NewVR =
5874 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
5875
5876 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
5877 FMAInstKind::Indexed, &NewVR);
5878 }
5879
5880 /// genMaddR - Generate madd instruction and combine mul and add using
5881 /// an extra virtual register
5882 /// Example - an ADD intermediate needs to be stored in a register:
5883 /// MUL I=A,B,0
5884 /// ADD R,I,Imm
5885 /// ==> ORR V, ZR, Imm
5886 /// ==> MADD R,A,B,V
5887 /// \param MF Containing MachineFunction
5888 /// \param MRI Register information
5889 /// \param TII Target information
5890 /// \param Root is the ADD instruction
5891 /// \param [out] InsInstrs is a vector of machine instructions and will
5892 /// contain the generated madd instruction
5893 /// \param IdxMulOpd is index of operand in Root that is the result of
5894 /// the MUL. In the example above IdxMulOpd is 1.
5895 /// \param MaddOpc the opcode of the madd instruction
5896 /// \param VR is a virtual register that holds the value of an ADD operand
5897 /// (V in the example above).
5898 /// \param RC Register class of operands 5899 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, 5900 const TargetInstrInfo *TII, MachineInstr &Root, 5901 SmallVectorImpl<MachineInstr *> &InsInstrs, 5902 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, 5903 const TargetRegisterClass *RC) { 5904 assert(IdxMulOpd == 1 || IdxMulOpd == 2); 5905 5906 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); 5907 Register ResultReg = Root.getOperand(0).getReg(); 5908 Register SrcReg0 = MUL->getOperand(1).getReg(); 5909 bool Src0IsKill = MUL->getOperand(1).isKill(); 5910 Register SrcReg1 = MUL->getOperand(2).getReg(); 5911 bool Src1IsKill = MUL->getOperand(2).isKill(); 5912 5913 if (ResultReg.isVirtual()) 5914 MRI.constrainRegClass(ResultReg, RC); 5915 if (SrcReg0.isVirtual()) 5916 MRI.constrainRegClass(SrcReg0, RC); 5917 if (SrcReg1.isVirtual()) 5918 MRI.constrainRegClass(SrcReg1, RC); 5919 if (Register::isVirtualRegister(VR)) 5920 MRI.constrainRegClass(VR, RC); 5921 5922 MachineInstrBuilder MIB = 5923 BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg) 5924 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 5925 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 5926 .addReg(VR); 5927 // Insert the MADD 5928 InsInstrs.push_back(MIB); 5929 return MUL; 5930 } 5931 5932 /// Do the following transformation 5933 /// A - (B + C) ==> (A - B) - C 5934 /// A - (B + C) ==> (A - C) - B 5935 static void 5936 genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI, 5937 const TargetInstrInfo *TII, MachineInstr &Root, 5938 SmallVectorImpl<MachineInstr *> &InsInstrs, 5939 SmallVectorImpl<MachineInstr *> &DelInstrs, 5940 unsigned IdxOpd1, 5941 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) { 5942 assert(IdxOpd1 == 1 || IdxOpd1 == 2); 5943 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 
2 : 1; 5944 MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg()); 5945 5946 Register ResultReg = Root.getOperand(0).getReg(); 5947 Register RegA = Root.getOperand(1).getReg(); 5948 bool RegAIsKill = Root.getOperand(1).isKill(); 5949 Register RegB = AddMI->getOperand(IdxOpd1).getReg(); 5950 bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill(); 5951 Register RegC = AddMI->getOperand(IdxOtherOpd).getReg(); 5952 bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill(); 5953 Register NewVR = MRI.createVirtualRegister(MRI.getRegClass(RegA)); 5954 5955 unsigned Opcode = Root.getOpcode(); 5956 if (Opcode == AArch64::SUBSWrr) 5957 Opcode = AArch64::SUBWrr; 5958 else if (Opcode == AArch64::SUBSXrr) 5959 Opcode = AArch64::SUBXrr; 5960 else 5961 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) && 5962 "Unexpected instruction opcode."); 5963 5964 MachineInstrBuilder MIB1 = 5965 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR) 5966 .addReg(RegA, getKillRegState(RegAIsKill)) 5967 .addReg(RegB, getKillRegState(RegBIsKill)); 5968 MachineInstrBuilder MIB2 = 5969 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg) 5970 .addReg(NewVR, getKillRegState(true)) 5971 .addReg(RegC, getKillRegState(RegCIsKill)); 5972 5973 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5974 InsInstrs.push_back(MIB1); 5975 InsInstrs.push_back(MIB2); 5976 DelInstrs.push_back(AddMI); 5977 } 5978 5979 /// When getMachineCombinerPatterns() finds potential patterns, 5980 /// this function generates the instructions that could replace the 5981 /// original code sequence 5982 void AArch64InstrInfo::genAlternativeCodeSequence( 5983 MachineInstr &Root, MachineCombinerPattern Pattern, 5984 SmallVectorImpl<MachineInstr *> &InsInstrs, 5985 SmallVectorImpl<MachineInstr *> &DelInstrs, 5986 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const { 5987 MachineBasicBlock &MBB = *Root.getParent(); 5988 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5989 MachineFunction &MF = *MBB.getParent(); 5990 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); 5991 5992 MachineInstr *MUL = nullptr; 5993 const TargetRegisterClass *RC; 5994 unsigned Opc; 5995 switch (Pattern) { 5996 default: 5997 // Reassociate instructions. 
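// Patterns not handled by the cases below are the generic reassociation
// patterns (e.g. (A + B) + C reassociated to A + (B + C)); the
// target-independent implementation emits the replacement sequence for them.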
5998 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, 5999 DelInstrs, InstrIdxForVirtReg); 6000 return; 6001 case MachineCombinerPattern::SUBADD_OP1: 6002 // A - (B + C) 6003 // ==> (A - B) - C 6004 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1, 6005 InstrIdxForVirtReg); 6006 break; 6007 case MachineCombinerPattern::SUBADD_OP2: 6008 // A - (B + C) 6009 // ==> (A - C) - B 6010 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2, 6011 InstrIdxForVirtReg); 6012 break; 6013 case MachineCombinerPattern::MULADDW_OP1: 6014 case MachineCombinerPattern::MULADDX_OP1: 6015 // MUL I=A,B,0 6016 // ADD R,I,C 6017 // ==> MADD R,A,B,C 6018 // --- Create(MADD); 6019 if (Pattern == MachineCombinerPattern::MULADDW_OP1) { 6020 Opc = AArch64::MADDWrrr; 6021 RC = &AArch64::GPR32RegClass; 6022 } else { 6023 Opc = AArch64::MADDXrrr; 6024 RC = &AArch64::GPR64RegClass; 6025 } 6026 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6027 break; 6028 case MachineCombinerPattern::MULADDW_OP2: 6029 case MachineCombinerPattern::MULADDX_OP2: 6030 // MUL I=A,B,0 6031 // ADD R,C,I 6032 // ==> MADD R,A,B,C 6033 // --- Create(MADD); 6034 if (Pattern == MachineCombinerPattern::MULADDW_OP2) { 6035 Opc = AArch64::MADDWrrr; 6036 RC = &AArch64::GPR32RegClass; 6037 } else { 6038 Opc = AArch64::MADDXrrr; 6039 RC = &AArch64::GPR64RegClass; 6040 } 6041 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6042 break; 6043 case MachineCombinerPattern::MULADDWI_OP1: 6044 case MachineCombinerPattern::MULADDXI_OP1: { 6045 // MUL I=A,B,0 6046 // ADD R,I,Imm 6047 // ==> MOV V, Imm 6048 // ==> MADD R,A,B,V 6049 // --- Create(MADD); 6050 const TargetRegisterClass *OrrRC; 6051 unsigned BitSize, OrrOpc, ZeroReg; 6052 if (Pattern == MachineCombinerPattern::MULADDWI_OP1) { 6053 OrrOpc = AArch64::ORRWri; 6054 OrrRC = &AArch64::GPR32spRegClass; 6055 BitSize = 32; 6056 ZeroReg = AArch64::WZR; 6057 Opc = AArch64::MADDWrrr; 6058 RC = &AArch64::GPR32RegClass; 6059 } else { 6060 OrrOpc = AArch64::ORRXri; 6061 OrrRC = &AArch64::GPR64spRegClass; 6062 BitSize = 64; 6063 ZeroReg = AArch64::XZR; 6064 Opc = AArch64::MADDXrrr; 6065 RC = &AArch64::GPR64RegClass; 6066 } 6067 Register NewVR = MRI.createVirtualRegister(OrrRC); 6068 uint64_t Imm = Root.getOperand(2).getImm(); 6069 6070 if (Root.getOperand(3).isImm()) { 6071 unsigned Val = Root.getOperand(3).getImm(); 6072 Imm = Imm << Val; 6073 } 6074 uint64_t UImm = SignExtend64(Imm, BitSize); 6075 // The immediate can be composed via a single instruction. 6076 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 6077 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn); 6078 if (Insn.size() != 1) 6079 return; 6080 auto MovI = Insn.begin(); 6081 MachineInstrBuilder MIB1; 6082 // MOV is an alias for one of three instructions: movz, movn, and orr. 
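// Re-emit whichever of those forms expandMOVImm selected so that NewVR
// materializes the (possibly shifted) addend immediate.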
6083 if (MovI->Opcode == OrrOpc) 6084 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR) 6085 .addReg(ZeroReg) 6086 .addImm(MovI->Op2); 6087 else { 6088 if (BitSize == 32) 6089 assert((MovI->Opcode == AArch64::MOVNWi || 6090 MovI->Opcode == AArch64::MOVZWi) && 6091 "Expected opcode"); 6092 else 6093 assert((MovI->Opcode == AArch64::MOVNXi || 6094 MovI->Opcode == AArch64::MOVZXi) && 6095 "Expected opcode"); 6096 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR) 6097 .addImm(MovI->Op1) 6098 .addImm(MovI->Op2); 6099 } 6100 InsInstrs.push_back(MIB1); 6101 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6102 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 6103 break; 6104 } 6105 case MachineCombinerPattern::MULSUBW_OP1: 6106 case MachineCombinerPattern::MULSUBX_OP1: { 6107 // MUL I=A,B,0 6108 // SUB R,I, C 6109 // ==> SUB V, 0, C 6110 // ==> MADD R,A,B,V // = -C + A*B 6111 // --- Create(MADD); 6112 const TargetRegisterClass *SubRC; 6113 unsigned SubOpc, ZeroReg; 6114 if (Pattern == MachineCombinerPattern::MULSUBW_OP1) { 6115 SubOpc = AArch64::SUBWrr; 6116 SubRC = &AArch64::GPR32spRegClass; 6117 ZeroReg = AArch64::WZR; 6118 Opc = AArch64::MADDWrrr; 6119 RC = &AArch64::GPR32RegClass; 6120 } else { 6121 SubOpc = AArch64::SUBXrr; 6122 SubRC = &AArch64::GPR64spRegClass; 6123 ZeroReg = AArch64::XZR; 6124 Opc = AArch64::MADDXrrr; 6125 RC = &AArch64::GPR64RegClass; 6126 } 6127 Register NewVR = MRI.createVirtualRegister(SubRC); 6128 // SUB NewVR, 0, C 6129 MachineInstrBuilder MIB1 = 6130 BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR) 6131 .addReg(ZeroReg) 6132 .add(Root.getOperand(2)); 6133 InsInstrs.push_back(MIB1); 6134 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6135 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 6136 break; 6137 } 6138 case MachineCombinerPattern::MULSUBW_OP2: 6139 case MachineCombinerPattern::MULSUBX_OP2: 6140 // MUL I=A,B,0 6141 // SUB R,C,I 6142 // ==> MSUB R,A,B,C (computes C - A*B) 6143 // --- Create(MSUB); 6144 if (Pattern == MachineCombinerPattern::MULSUBW_OP2) { 6145 Opc = AArch64::MSUBWrrr; 6146 RC = &AArch64::GPR32RegClass; 6147 } else { 6148 Opc = AArch64::MSUBXrrr; 6149 RC = &AArch64::GPR64RegClass; 6150 } 6151 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6152 break; 6153 case MachineCombinerPattern::MULSUBWI_OP1: 6154 case MachineCombinerPattern::MULSUBXI_OP1: { 6155 // MUL I=A,B,0 6156 // SUB R,I, Imm 6157 // ==> MOV V, -Imm 6158 // ==> MADD R,A,B,V // = -Imm + A*B 6159 // --- Create(MADD); 6160 const TargetRegisterClass *OrrRC; 6161 unsigned BitSize, OrrOpc, ZeroReg; 6162 if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) { 6163 OrrOpc = AArch64::ORRWri; 6164 OrrRC = &AArch64::GPR32spRegClass; 6165 BitSize = 32; 6166 ZeroReg = AArch64::WZR; 6167 Opc = AArch64::MADDWrrr; 6168 RC = &AArch64::GPR32RegClass; 6169 } else { 6170 OrrOpc = AArch64::ORRXri; 6171 OrrRC = &AArch64::GPR64spRegClass; 6172 BitSize = 64; 6173 ZeroReg = AArch64::XZR; 6174 Opc = AArch64::MADDXrrr; 6175 RC = &AArch64::GPR64RegClass; 6176 } 6177 Register NewVR = MRI.createVirtualRegister(OrrRC); 6178 uint64_t Imm = Root.getOperand(2).getImm(); 6179 if (Root.getOperand(3).isImm()) { 6180 unsigned Val = Root.getOperand(3).getImm(); 6181 Imm = Imm << Val; 6182 } 6183 uint64_t UImm = SignExtend64(-Imm, BitSize); 6184 // The immediate can be composed via a single instruction. 
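// (If expandMOVImm needs more than one instruction here, we bail out below
// rather than lengthen the replacement sequence.)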
6185 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 6186 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn); 6187 if (Insn.size() != 1) 6188 return; 6189 auto MovI = Insn.begin(); 6190 MachineInstrBuilder MIB1; 6191 // MOV is an alias for one of three instructions: movz, movn, and orr. 6192 if (MovI->Opcode == OrrOpc) 6193 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR) 6194 .addReg(ZeroReg) 6195 .addImm(MovI->Op2); 6196 else { 6197 if (BitSize == 32) 6198 assert((MovI->Opcode == AArch64::MOVNWi || 6199 MovI->Opcode == AArch64::MOVZWi) && 6200 "Expected opcode"); 6201 else 6202 assert((MovI->Opcode == AArch64::MOVNXi || 6203 MovI->Opcode == AArch64::MOVZXi) && 6204 "Expected opcode"); 6205 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR) 6206 .addImm(MovI->Op1) 6207 .addImm(MovI->Op2); 6208 } 6209 InsInstrs.push_back(MIB1); 6210 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6211 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 6212 break; 6213 } 6214 6215 case MachineCombinerPattern::MULADDv8i8_OP1: 6216 Opc = AArch64::MLAv8i8; 6217 RC = &AArch64::FPR64RegClass; 6218 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6219 break; 6220 case MachineCombinerPattern::MULADDv8i8_OP2: 6221 Opc = AArch64::MLAv8i8; 6222 RC = &AArch64::FPR64RegClass; 6223 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6224 break; 6225 case MachineCombinerPattern::MULADDv16i8_OP1: 6226 Opc = AArch64::MLAv16i8; 6227 RC = &AArch64::FPR128RegClass; 6228 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6229 break; 6230 case MachineCombinerPattern::MULADDv16i8_OP2: 6231 Opc = AArch64::MLAv16i8; 6232 RC = &AArch64::FPR128RegClass; 6233 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6234 break; 6235 case MachineCombinerPattern::MULADDv4i16_OP1: 6236 Opc = AArch64::MLAv4i16; 6237 RC = &AArch64::FPR64RegClass; 6238 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6239 break; 6240 case MachineCombinerPattern::MULADDv4i16_OP2: 6241 Opc = AArch64::MLAv4i16; 6242 RC = &AArch64::FPR64RegClass; 6243 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6244 break; 6245 case MachineCombinerPattern::MULADDv8i16_OP1: 6246 Opc = AArch64::MLAv8i16; 6247 RC = &AArch64::FPR128RegClass; 6248 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6249 break; 6250 case MachineCombinerPattern::MULADDv8i16_OP2: 6251 Opc = AArch64::MLAv8i16; 6252 RC = &AArch64::FPR128RegClass; 6253 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6254 break; 6255 case MachineCombinerPattern::MULADDv2i32_OP1: 6256 Opc = AArch64::MLAv2i32; 6257 RC = &AArch64::FPR64RegClass; 6258 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6259 break; 6260 case MachineCombinerPattern::MULADDv2i32_OP2: 6261 Opc = AArch64::MLAv2i32; 6262 RC = &AArch64::FPR64RegClass; 6263 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6264 break; 6265 case MachineCombinerPattern::MULADDv4i32_OP1: 6266 Opc = AArch64::MLAv4i32; 6267 RC = &AArch64::FPR128RegClass; 6268 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6269 break; 6270 case MachineCombinerPattern::MULADDv4i32_OP2: 6271 Opc = AArch64::MLAv4i32; 6272 RC = &AArch64::FPR128RegClass; 6273 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6274 break; 6275 6276 case MachineCombinerPattern::MULSUBv8i8_OP1: 6277 Opc = AArch64::MLAv8i8; 6278 RC = 
&AArch64::FPR64RegClass; 6279 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 6280 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8, 6281 RC); 6282 break; 6283 case MachineCombinerPattern::MULSUBv8i8_OP2: 6284 Opc = AArch64::MLSv8i8; 6285 RC = &AArch64::FPR64RegClass; 6286 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6287 break; 6288 case MachineCombinerPattern::MULSUBv16i8_OP1: 6289 Opc = AArch64::MLAv16i8; 6290 RC = &AArch64::FPR128RegClass; 6291 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 6292 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8, 6293 RC); 6294 break; 6295 case MachineCombinerPattern::MULSUBv16i8_OP2: 6296 Opc = AArch64::MLSv16i8; 6297 RC = &AArch64::FPR128RegClass; 6298 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6299 break; 6300 case MachineCombinerPattern::MULSUBv4i16_OP1: 6301 Opc = AArch64::MLAv4i16; 6302 RC = &AArch64::FPR64RegClass; 6303 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 6304 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, 6305 RC); 6306 break; 6307 case MachineCombinerPattern::MULSUBv4i16_OP2: 6308 Opc = AArch64::MLSv4i16; 6309 RC = &AArch64::FPR64RegClass; 6310 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6311 break; 6312 case MachineCombinerPattern::MULSUBv8i16_OP1: 6313 Opc = AArch64::MLAv8i16; 6314 RC = &AArch64::FPR128RegClass; 6315 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 6316 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, 6317 RC); 6318 break; 6319 case MachineCombinerPattern::MULSUBv8i16_OP2: 6320 Opc = AArch64::MLSv8i16; 6321 RC = &AArch64::FPR128RegClass; 6322 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6323 break; 6324 case MachineCombinerPattern::MULSUBv2i32_OP1: 6325 Opc = AArch64::MLAv2i32; 6326 RC = &AArch64::FPR64RegClass; 6327 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 6328 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, 6329 RC); 6330 break; 6331 case MachineCombinerPattern::MULSUBv2i32_OP2: 6332 Opc = AArch64::MLSv2i32; 6333 RC = &AArch64::FPR64RegClass; 6334 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6335 break; 6336 case MachineCombinerPattern::MULSUBv4i32_OP1: 6337 Opc = AArch64::MLAv4i32; 6338 RC = &AArch64::FPR128RegClass; 6339 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 6340 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, 6341 RC); 6342 break; 6343 case MachineCombinerPattern::MULSUBv4i32_OP2: 6344 Opc = AArch64::MLSv4i32; 6345 RC = &AArch64::FPR128RegClass; 6346 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6347 break; 6348 6349 case MachineCombinerPattern::MULADDv4i16_indexed_OP1: 6350 Opc = AArch64::MLAv4i16_indexed; 6351 RC = &AArch64::FPR64RegClass; 6352 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6353 break; 6354 case MachineCombinerPattern::MULADDv4i16_indexed_OP2: 6355 Opc = AArch64::MLAv4i16_indexed; 6356 RC = &AArch64::FPR64RegClass; 6357 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6358 break; 6359 case MachineCombinerPattern::MULADDv8i16_indexed_OP1: 6360 Opc = AArch64::MLAv8i16_indexed; 6361 RC = &AArch64::FPR128RegClass; 6362 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6363 break; 6364 case MachineCombinerPattern::MULADDv8i16_indexed_OP2: 6365 Opc = AArch64::MLAv8i16_indexed; 6366 RC = &AArch64::FPR128RegClass; 6367 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6368 break; 6369 case 
MachineCombinerPattern::MULADDv2i32_indexed_OP1: 6370 Opc = AArch64::MLAv2i32_indexed; 6371 RC = &AArch64::FPR64RegClass; 6372 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6373 break; 6374 case MachineCombinerPattern::MULADDv2i32_indexed_OP2: 6375 Opc = AArch64::MLAv2i32_indexed; 6376 RC = &AArch64::FPR64RegClass; 6377 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6378 break; 6379 case MachineCombinerPattern::MULADDv4i32_indexed_OP1: 6380 Opc = AArch64::MLAv4i32_indexed; 6381 RC = &AArch64::FPR128RegClass; 6382 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6383 break; 6384 case MachineCombinerPattern::MULADDv4i32_indexed_OP2: 6385 Opc = AArch64::MLAv4i32_indexed; 6386 RC = &AArch64::FPR128RegClass; 6387 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6388 break; 6389 6390 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 6391 Opc = AArch64::MLAv4i16_indexed; 6392 RC = &AArch64::FPR64RegClass; 6393 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 6394 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, 6395 RC); 6396 break; 6397 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 6398 Opc = AArch64::MLSv4i16_indexed; 6399 RC = &AArch64::FPR64RegClass; 6400 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6401 break; 6402 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 6403 Opc = AArch64::MLAv8i16_indexed; 6404 RC = &AArch64::FPR128RegClass; 6405 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 6406 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, 6407 RC); 6408 break; 6409 case MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 6410 Opc = AArch64::MLSv8i16_indexed; 6411 RC = &AArch64::FPR128RegClass; 6412 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6413 break; 6414 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 6415 Opc = AArch64::MLAv2i32_indexed; 6416 RC = &AArch64::FPR64RegClass; 6417 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 6418 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, 6419 RC); 6420 break; 6421 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 6422 Opc = AArch64::MLSv2i32_indexed; 6423 RC = &AArch64::FPR64RegClass; 6424 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6425 break; 6426 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 6427 Opc = AArch64::MLAv4i32_indexed; 6428 RC = &AArch64::FPR128RegClass; 6429 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 6430 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, 6431 RC); 6432 break; 6433 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 6434 Opc = AArch64::MLSv4i32_indexed; 6435 RC = &AArch64::FPR128RegClass; 6436 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6437 break; 6438 6439 // Floating Point Support 6440 case MachineCombinerPattern::FMULADDH_OP1: 6441 Opc = AArch64::FMADDHrrr; 6442 RC = &AArch64::FPR16RegClass; 6443 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6444 break; 6445 case MachineCombinerPattern::FMULADDS_OP1: 6446 Opc = AArch64::FMADDSrrr; 6447 RC = &AArch64::FPR32RegClass; 6448 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6449 break; 6450 case MachineCombinerPattern::FMULADDD_OP1: 6451 Opc = AArch64::FMADDDrrr; 6452 RC = &AArch64::FPR64RegClass; 6453 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6454 break; 6455 6456 case MachineCombinerPattern::FMULADDH_OP2: 6457 Opc = AArch64::FMADDHrrr; 
6458 RC = &AArch64::FPR16RegClass; 6459 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6460 break; 6461 case MachineCombinerPattern::FMULADDS_OP2: 6462 Opc = AArch64::FMADDSrrr; 6463 RC = &AArch64::FPR32RegClass; 6464 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6465 break; 6466 case MachineCombinerPattern::FMULADDD_OP2: 6467 Opc = AArch64::FMADDDrrr; 6468 RC = &AArch64::FPR64RegClass; 6469 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6470 break; 6471 6472 case MachineCombinerPattern::FMLAv1i32_indexed_OP1: 6473 Opc = AArch64::FMLAv1i32_indexed; 6474 RC = &AArch64::FPR32RegClass; 6475 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6476 FMAInstKind::Indexed); 6477 break; 6478 case MachineCombinerPattern::FMLAv1i32_indexed_OP2: 6479 Opc = AArch64::FMLAv1i32_indexed; 6480 RC = &AArch64::FPR32RegClass; 6481 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6482 FMAInstKind::Indexed); 6483 break; 6484 6485 case MachineCombinerPattern::FMLAv1i64_indexed_OP1: 6486 Opc = AArch64::FMLAv1i64_indexed; 6487 RC = &AArch64::FPR64RegClass; 6488 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6489 FMAInstKind::Indexed); 6490 break; 6491 case MachineCombinerPattern::FMLAv1i64_indexed_OP2: 6492 Opc = AArch64::FMLAv1i64_indexed; 6493 RC = &AArch64::FPR64RegClass; 6494 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6495 FMAInstKind::Indexed); 6496 break; 6497 6498 case MachineCombinerPattern::FMLAv4i16_indexed_OP1: 6499 RC = &AArch64::FPR64RegClass; 6500 Opc = AArch64::FMLAv4i16_indexed; 6501 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6502 FMAInstKind::Indexed); 6503 break; 6504 case MachineCombinerPattern::FMLAv4f16_OP1: 6505 RC = &AArch64::FPR64RegClass; 6506 Opc = AArch64::FMLAv4f16; 6507 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6508 FMAInstKind::Accumulator); 6509 break; 6510 case MachineCombinerPattern::FMLAv4i16_indexed_OP2: 6511 RC = &AArch64::FPR64RegClass; 6512 Opc = AArch64::FMLAv4i16_indexed; 6513 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6514 FMAInstKind::Indexed); 6515 break; 6516 case MachineCombinerPattern::FMLAv4f16_OP2: 6517 RC = &AArch64::FPR64RegClass; 6518 Opc = AArch64::FMLAv4f16; 6519 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6520 FMAInstKind::Accumulator); 6521 break; 6522 6523 case MachineCombinerPattern::FMLAv2i32_indexed_OP1: 6524 case MachineCombinerPattern::FMLAv2f32_OP1: 6525 RC = &AArch64::FPR64RegClass; 6526 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) { 6527 Opc = AArch64::FMLAv2i32_indexed; 6528 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6529 FMAInstKind::Indexed); 6530 } else { 6531 Opc = AArch64::FMLAv2f32; 6532 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6533 FMAInstKind::Accumulator); 6534 } 6535 break; 6536 case MachineCombinerPattern::FMLAv2i32_indexed_OP2: 6537 case MachineCombinerPattern::FMLAv2f32_OP2: 6538 RC = &AArch64::FPR64RegClass; 6539 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) { 6540 Opc = AArch64::FMLAv2i32_indexed; 6541 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6542 FMAInstKind::Indexed); 6543 } else { 6544 Opc = AArch64::FMLAv2f32; 6545 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6546 FMAInstKind::Accumulator); 6547 } 6548 break; 6549 6550 case MachineCombinerPattern::FMLAv8i16_indexed_OP1: 6551 RC = 
&AArch64::FPR128RegClass; 6552 Opc = AArch64::FMLAv8i16_indexed; 6553 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6554 FMAInstKind::Indexed); 6555 break; 6556 case MachineCombinerPattern::FMLAv8f16_OP1: 6557 RC = &AArch64::FPR128RegClass; 6558 Opc = AArch64::FMLAv8f16; 6559 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6560 FMAInstKind::Accumulator); 6561 break; 6562 case MachineCombinerPattern::FMLAv8i16_indexed_OP2: 6563 RC = &AArch64::FPR128RegClass; 6564 Opc = AArch64::FMLAv8i16_indexed; 6565 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6566 FMAInstKind::Indexed); 6567 break; 6568 case MachineCombinerPattern::FMLAv8f16_OP2: 6569 RC = &AArch64::FPR128RegClass; 6570 Opc = AArch64::FMLAv8f16; 6571 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6572 FMAInstKind::Accumulator); 6573 break; 6574 6575 case MachineCombinerPattern::FMLAv2i64_indexed_OP1: 6576 case MachineCombinerPattern::FMLAv2f64_OP1: 6577 RC = &AArch64::FPR128RegClass; 6578 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) { 6579 Opc = AArch64::FMLAv2i64_indexed; 6580 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6581 FMAInstKind::Indexed); 6582 } else { 6583 Opc = AArch64::FMLAv2f64; 6584 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6585 FMAInstKind::Accumulator); 6586 } 6587 break; 6588 case MachineCombinerPattern::FMLAv2i64_indexed_OP2: 6589 case MachineCombinerPattern::FMLAv2f64_OP2: 6590 RC = &AArch64::FPR128RegClass; 6591 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) { 6592 Opc = AArch64::FMLAv2i64_indexed; 6593 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6594 FMAInstKind::Indexed); 6595 } else { 6596 Opc = AArch64::FMLAv2f64; 6597 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6598 FMAInstKind::Accumulator); 6599 } 6600 break; 6601 6602 case MachineCombinerPattern::FMLAv4i32_indexed_OP1: 6603 case MachineCombinerPattern::FMLAv4f32_OP1: 6604 RC = &AArch64::FPR128RegClass; 6605 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) { 6606 Opc = AArch64::FMLAv4i32_indexed; 6607 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6608 FMAInstKind::Indexed); 6609 } else { 6610 Opc = AArch64::FMLAv4f32; 6611 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6612 FMAInstKind::Accumulator); 6613 } 6614 break; 6615 6616 case MachineCombinerPattern::FMLAv4i32_indexed_OP2: 6617 case MachineCombinerPattern::FMLAv4f32_OP2: 6618 RC = &AArch64::FPR128RegClass; 6619 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) { 6620 Opc = AArch64::FMLAv4i32_indexed; 6621 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6622 FMAInstKind::Indexed); 6623 } else { 6624 Opc = AArch64::FMLAv4f32; 6625 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6626 FMAInstKind::Accumulator); 6627 } 6628 break; 6629 6630 case MachineCombinerPattern::FMULSUBH_OP1: 6631 Opc = AArch64::FNMSUBHrrr; 6632 RC = &AArch64::FPR16RegClass; 6633 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6634 break; 6635 case MachineCombinerPattern::FMULSUBS_OP1: 6636 Opc = AArch64::FNMSUBSrrr; 6637 RC = &AArch64::FPR32RegClass; 6638 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6639 break; 6640 case MachineCombinerPattern::FMULSUBD_OP1: 6641 Opc = AArch64::FNMSUBDrrr; 6642 RC = &AArch64::FPR64RegClass; 6643 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6644 
break; 6645 6646 case MachineCombinerPattern::FNMULSUBH_OP1: 6647 Opc = AArch64::FNMADDHrrr; 6648 RC = &AArch64::FPR16RegClass; 6649 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6650 break; 6651 case MachineCombinerPattern::FNMULSUBS_OP1: 6652 Opc = AArch64::FNMADDSrrr; 6653 RC = &AArch64::FPR32RegClass; 6654 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6655 break; 6656 case MachineCombinerPattern::FNMULSUBD_OP1: 6657 Opc = AArch64::FNMADDDrrr; 6658 RC = &AArch64::FPR64RegClass; 6659 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6660 break; 6661 6662 case MachineCombinerPattern::FMULSUBH_OP2: 6663 Opc = AArch64::FMSUBHrrr; 6664 RC = &AArch64::FPR16RegClass; 6665 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6666 break; 6667 case MachineCombinerPattern::FMULSUBS_OP2: 6668 Opc = AArch64::FMSUBSrrr; 6669 RC = &AArch64::FPR32RegClass; 6670 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6671 break; 6672 case MachineCombinerPattern::FMULSUBD_OP2: 6673 Opc = AArch64::FMSUBDrrr; 6674 RC = &AArch64::FPR64RegClass; 6675 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6676 break; 6677 6678 case MachineCombinerPattern::FMLSv1i32_indexed_OP2: 6679 Opc = AArch64::FMLSv1i32_indexed; 6680 RC = &AArch64::FPR32RegClass; 6681 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6682 FMAInstKind::Indexed); 6683 break; 6684 6685 case MachineCombinerPattern::FMLSv1i64_indexed_OP2: 6686 Opc = AArch64::FMLSv1i64_indexed; 6687 RC = &AArch64::FPR64RegClass; 6688 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6689 FMAInstKind::Indexed); 6690 break; 6691 6692 case MachineCombinerPattern::FMLSv4f16_OP1: 6693 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: { 6694 RC = &AArch64::FPR64RegClass; 6695 Register NewVR = MRI.createVirtualRegister(RC); 6696 MachineInstrBuilder MIB1 = 6697 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR) 6698 .add(Root.getOperand(2)); 6699 InsInstrs.push_back(MIB1); 6700 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6701 if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) { 6702 Opc = AArch64::FMLAv4f16; 6703 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6704 FMAInstKind::Accumulator, &NewVR); 6705 } else { 6706 Opc = AArch64::FMLAv4i16_indexed; 6707 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6708 FMAInstKind::Indexed, &NewVR); 6709 } 6710 break; 6711 } 6712 case MachineCombinerPattern::FMLSv4f16_OP2: 6713 RC = &AArch64::FPR64RegClass; 6714 Opc = AArch64::FMLSv4f16; 6715 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6716 FMAInstKind::Accumulator); 6717 break; 6718 case MachineCombinerPattern::FMLSv4i16_indexed_OP2: 6719 RC = &AArch64::FPR64RegClass; 6720 Opc = AArch64::FMLSv4i16_indexed; 6721 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6722 FMAInstKind::Indexed); 6723 break; 6724 6725 case MachineCombinerPattern::FMLSv2f32_OP2: 6726 case MachineCombinerPattern::FMLSv2i32_indexed_OP2: 6727 RC = &AArch64::FPR64RegClass; 6728 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) { 6729 Opc = AArch64::FMLSv2i32_indexed; 6730 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6731 FMAInstKind::Indexed); 6732 } else { 6733 Opc = AArch64::FMLSv2f32; 6734 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6735 FMAInstKind::Accumulator); 6736 } 6737 break; 6738 6739 case 
MachineCombinerPattern::FMLSv8f16_OP1: 6740 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: { 6741 RC = &AArch64::FPR128RegClass; 6742 Register NewVR = MRI.createVirtualRegister(RC); 6743 MachineInstrBuilder MIB1 = 6744 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR) 6745 .add(Root.getOperand(2)); 6746 InsInstrs.push_back(MIB1); 6747 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6748 if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) { 6749 Opc = AArch64::FMLAv8f16; 6750 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6751 FMAInstKind::Accumulator, &NewVR); 6752 } else { 6753 Opc = AArch64::FMLAv8i16_indexed; 6754 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6755 FMAInstKind::Indexed, &NewVR); 6756 } 6757 break; 6758 } 6759 case MachineCombinerPattern::FMLSv8f16_OP2: 6760 RC = &AArch64::FPR128RegClass; 6761 Opc = AArch64::FMLSv8f16; 6762 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6763 FMAInstKind::Accumulator); 6764 break; 6765 case MachineCombinerPattern::FMLSv8i16_indexed_OP2: 6766 RC = &AArch64::FPR128RegClass; 6767 Opc = AArch64::FMLSv8i16_indexed; 6768 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6769 FMAInstKind::Indexed); 6770 break; 6771 6772 case MachineCombinerPattern::FMLSv2f64_OP2: 6773 case MachineCombinerPattern::FMLSv2i64_indexed_OP2: 6774 RC = &AArch64::FPR128RegClass; 6775 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) { 6776 Opc = AArch64::FMLSv2i64_indexed; 6777 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6778 FMAInstKind::Indexed); 6779 } else { 6780 Opc = AArch64::FMLSv2f64; 6781 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6782 FMAInstKind::Accumulator); 6783 } 6784 break; 6785 6786 case MachineCombinerPattern::FMLSv4f32_OP2: 6787 case MachineCombinerPattern::FMLSv4i32_indexed_OP2: 6788 RC = &AArch64::FPR128RegClass; 6789 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) { 6790 Opc = AArch64::FMLSv4i32_indexed; 6791 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6792 FMAInstKind::Indexed); 6793 } else { 6794 Opc = AArch64::FMLSv4f32; 6795 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6796 FMAInstKind::Accumulator); 6797 } 6798 break; 6799 case MachineCombinerPattern::FMLSv2f32_OP1: 6800 case MachineCombinerPattern::FMLSv2i32_indexed_OP1: { 6801 RC = &AArch64::FPR64RegClass; 6802 Register NewVR = MRI.createVirtualRegister(RC); 6803 MachineInstrBuilder MIB1 = 6804 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR) 6805 .add(Root.getOperand(2)); 6806 InsInstrs.push_back(MIB1); 6807 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6808 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) { 6809 Opc = AArch64::FMLAv2i32_indexed; 6810 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6811 FMAInstKind::Indexed, &NewVR); 6812 } else { 6813 Opc = AArch64::FMLAv2f32; 6814 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6815 FMAInstKind::Accumulator, &NewVR); 6816 } 6817 break; 6818 } 6819 case MachineCombinerPattern::FMLSv4f32_OP1: 6820 case MachineCombinerPattern::FMLSv4i32_indexed_OP1: { 6821 RC = &AArch64::FPR128RegClass; 6822 Register NewVR = MRI.createVirtualRegister(RC); 6823 MachineInstrBuilder MIB1 = 6824 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR) 6825 .add(Root.getOperand(2)); 6826 InsInstrs.push_back(MIB1); 6827 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 
0)); 6828 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) { 6829 Opc = AArch64::FMLAv4i32_indexed; 6830 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6831 FMAInstKind::Indexed, &NewVR); 6832 } else { 6833 Opc = AArch64::FMLAv4f32; 6834 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6835 FMAInstKind::Accumulator, &NewVR); 6836 } 6837 break; 6838 } 6839 case MachineCombinerPattern::FMLSv2f64_OP1: 6840 case MachineCombinerPattern::FMLSv2i64_indexed_OP1: { 6841 RC = &AArch64::FPR128RegClass; 6842 Register NewVR = MRI.createVirtualRegister(RC); 6843 MachineInstrBuilder MIB1 = 6844 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR) 6845 .add(Root.getOperand(2)); 6846 InsInstrs.push_back(MIB1); 6847 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6848 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) { 6849 Opc = AArch64::FMLAv2i64_indexed; 6850 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6851 FMAInstKind::Indexed, &NewVR); 6852 } else { 6853 Opc = AArch64::FMLAv2f64; 6854 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6855 FMAInstKind::Accumulator, &NewVR); 6856 } 6857 break; 6858 } 6859 case MachineCombinerPattern::FMULv2i32_indexed_OP1: 6860 case MachineCombinerPattern::FMULv2i32_indexed_OP2: { 6861 unsigned IdxDupOp = 6862 (Pattern == MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1 : 2; 6863 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed, 6864 &AArch64::FPR128RegClass, MRI); 6865 break; 6866 } 6867 case MachineCombinerPattern::FMULv2i64_indexed_OP1: 6868 case MachineCombinerPattern::FMULv2i64_indexed_OP2: { 6869 unsigned IdxDupOp = 6870 (Pattern == MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1 : 2; 6871 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed, 6872 &AArch64::FPR128RegClass, MRI); 6873 break; 6874 } 6875 case MachineCombinerPattern::FMULv4i16_indexed_OP1: 6876 case MachineCombinerPattern::FMULv4i16_indexed_OP2: { 6877 unsigned IdxDupOp = 6878 (Pattern == MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1 : 2; 6879 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed, 6880 &AArch64::FPR128_loRegClass, MRI); 6881 break; 6882 } 6883 case MachineCombinerPattern::FMULv4i32_indexed_OP1: 6884 case MachineCombinerPattern::FMULv4i32_indexed_OP2: { 6885 unsigned IdxDupOp = 6886 (Pattern == MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1 : 2; 6887 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed, 6888 &AArch64::FPR128RegClass, MRI); 6889 break; 6890 } 6891 case MachineCombinerPattern::FMULv8i16_indexed_OP1: 6892 case MachineCombinerPattern::FMULv8i16_indexed_OP2: { 6893 unsigned IdxDupOp = 6894 (Pattern == MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1 : 2; 6895 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed, 6896 &AArch64::FPR128_loRegClass, MRI); 6897 break; 6898 } 6899 case MachineCombinerPattern::FNMADD: { 6900 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs); 6901 break; 6902 } 6903 6904 } // end switch (Pattern) 6905 // Record MUL and ADD/SUB for deletion 6906 if (MUL) 6907 DelInstrs.push_back(MUL); 6908 DelInstrs.push_back(&Root); 6909 6910 // Set the flags on the inserted instructions to be the merged flags of the 6911 // instructions that we have combined. 
  uint32_t Flags = Root.getFlags();
  if (MUL)
    Flags = Root.mergeFlagsWith(*MUL);
  for (auto *MI : InsInstrs)
    MI->setFlags(Flags);
}

/// Replace csinc-branch sequence by simple conditional branch
///
/// Examples:
/// 1. \code
///   csinc  w9, wzr, wzr, <condition code>
///   tbnz   w9, #0, 0x44
///    \endcode
/// to
///    \code
///   b.<inverted condition code>
///    \endcode
///
/// 2. \code
///   csinc w9, wzr, wzr, <condition code>
///   tbz   w9, #0, 0x44
///    \endcode
/// to
///    \code
///   b.<condition code>
///    \endcode
///
/// Replace compare and branch sequence by TBZ/TBNZ instruction when the
/// compare's constant operand is power of 2.
///
/// Examples:
///    \code
///   and  w8, w8, #0x400
///   cbnz w8, L1
///    \endcode
/// to
///    \code
///   tbnz w8, #10, L1
///    \endcode
///
/// \param  MI Conditional Branch
/// \return True when the simple conditional branch is generated
///
bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
  bool IsNegativeBranch = false;
  bool IsTestAndBranch = false;
  unsigned TargetBBInMI = 0;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
    return false;
  case AArch64::CBZW:
  case AArch64::CBZX:
    TargetBBInMI = 1;
    break;
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    TargetBBInMI = 1;
    IsNegativeBranch = true;
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
    TargetBBInMI = 2;
    IsTestAndBranch = true;
    break;
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    TargetBBInMI = 2;
    IsNegativeBranch = true;
    IsTestAndBranch = true;
    break;
  }
  // So we increment a zero register and test for bits other
  // than bit 0? Conservatively bail out in case the verifier
  // missed this case.
  if (IsTestAndBranch && MI.getOperand(1).getImm())
    return false;

  // Find Definition.
  assert(MI.getParent() && "Incomplete machine instruction\n");
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  Register VReg = MI.getOperand(0).getReg();
  if (!VReg.isVirtual())
    return false;

  MachineInstr *DefMI = MRI->getVRegDef(VReg);

  // Look through COPY instructions to find definition.
  while (DefMI->isCopy()) {
    Register CopyVReg = DefMI->getOperand(1).getReg();
    if (!MRI->hasOneNonDBGUse(CopyVReg))
      return false;
    if (!MRI->hasOneDef(CopyVReg))
      return false;
    DefMI = MRI->getVRegDef(CopyVReg);
  }

  switch (DefMI->getOpcode()) {
  default:
    return false;
  // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
  case AArch64::ANDWri:
  case AArch64::ANDXri: {
    if (IsTestAndBranch)
      return false;
    if (DefMI->getParent() != MBB)
      return false;
    if (!MRI->hasOneNonDBGUse(VReg))
      return false;

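    // A sketch of the fold performed below (virtual register numbers are
    // illustrative):
    //   %1:gpr32 = ANDWri %0, <logical immediate encoding 0x400>
    //   CBNZW killed %1, %bb.target
    // becomes, since 0x400 is a power of two,
    //   TBNZW %0, 10, %bb.target
    // For a 64-bit AND with a bit index below 32, the W variant is used on
    // the 32-bit sub-register instead.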
    bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
    uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
        DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
    if (!isPowerOf2_64(Mask))
      return false;

    MachineOperand &MO = DefMI->getOperand(1);
    Register NewReg = MO.getReg();
    if (!NewReg.isVirtual())
      return false;

    assert(!MRI->def_empty(NewReg) && "Register must be defined.");

    MachineBasicBlock &RefToMBB = *MBB;
    MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
    DebugLoc DL = MI.getDebugLoc();
    unsigned Imm = Log2_64(Mask);
    unsigned Opc = (Imm < 32)
                       ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
                       : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
    MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
                              .addReg(NewReg)
                              .addImm(Imm)
                              .addMBB(TBB);
    // The register now lives on to the new TB(N)Z.
    MO.setIsKill(false);

    // For immediates smaller than 32, we need to use the 32-bit variant (W)
    // in all cases, since the 64-bit variant cannot encode them.
    // Therefore, if the input register is 64-bit, we need to take the
    // 32-bit sub-register.
    if (!Is32Bit && Imm < 32)
      NewMI->getOperand(0).setSubReg(AArch64::sub_32);
    MI.eraseFromParent();
    return true;
  }
  // Look for CSINC.
  case AArch64::CSINCWr:
  case AArch64::CSINCXr: {
    if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
          DefMI->getOperand(2).getReg() == AArch64::WZR) &&
        !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
          DefMI->getOperand(2).getReg() == AArch64::XZR))
      return false;

    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
      return false;

    AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
    // Convert only when the condition code is not modified between
    // the CSINC and the branch. The CC may be used by other
    // instructions in between.
    if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
      return false;
    MachineBasicBlock &RefToMBB = *MBB;
    MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
    DebugLoc DL = MI.getDebugLoc();
    if (IsNegativeBranch)
      CC = AArch64CC::getInvertedCondCode(CC);
    BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
    MI.eraseFromParent();
    return true;
  }
  }
}

std::pair<unsigned, unsigned>
AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
  const unsigned Mask = AArch64II::MO_FRAGMENT;
  return std::make_pair(TF & Mask, TF & ~Mask);
}

ArrayRef<std::pair<unsigned, const char *>>
AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
  using namespace AArch64II;

  static const std::pair<unsigned, const char *> TargetFlags[] = {
      {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
      {MO_G3, "aarch64-g3"},     {MO_G2, "aarch64-g2"},
      {MO_G1, "aarch64-g1"},     {MO_G0, "aarch64-g0"},
      {MO_HI12, "aarch64-hi12"}};
  return ArrayRef(TargetFlags);
}

ArrayRef<std::pair<unsigned, const char *>>
AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
  using namespace AArch64II;

  static const std::pair<unsigned, const char *> TargetFlags[] = {
      {MO_COFFSTUB, "aarch64-coffstub"},
      {MO_GOT, "aarch64-got"},
      {MO_NC, "aarch64-nc"},
      {MO_S, "aarch64-s"},
      {MO_TLS, "aarch64-tls"},
      {MO_DLLIMPORT, "aarch64-dllimport"},
      {MO_DLLIMPORTAUX, "aarch64-dllimportaux"},
      {MO_PREL, "aarch64-prel"},
      {MO_TAGGED, "aarch64-tagged"}};
  return ArrayRef(TargetFlags);
}

ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
  static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
      {{MOSuppressPair, "aarch64-suppress-pair"},
       {MOStridedAccess, "aarch64-strided-access"}};
  return ArrayRef(TargetFlags);
}

/// Constants defining how certain sequences should be outlined.
/// This encompasses how an outlined function should be called, and what kind
/// of frame should be emitted for that outlined function.
///
/// \p MachineOutlinerDefault implies that the function should be called with
/// a save and restore of LR to the stack.
///
/// That is,
///
/// I1     Save LR                    OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3     Restore LR                 I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 3 (save + BL + restore)
/// * Frame construction overhead: 1 (ret)
/// * Requires stack fixups? Yes
///
/// \p MachineOutlinerTailCall implies that the function is being created from
/// a sequence of instructions ending in a return.
///
/// That is,
///
/// I1                             OUTLINED_FUNCTION:
/// I2 --> B OUTLINED_FUNCTION     I1
/// RET                            I2
///                                RET
///
/// * Call construction overhead: 1 (B)
/// * Frame construction overhead: 0 (Return included in sequence)
/// * Requires stack fixups? No
///
/// \p MachineOutlinerNoLRSave implies that the function should be called using
/// a BL instruction, but doesn't require LR to be saved and restored. This
/// happens when LR is known to be dead.
///
/// That is,
///
/// I1                                OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3                                I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 1 (BL)
/// * Frame construction overhead: 1 (RET)
/// * Requires stack fixups? No
///
/// \p MachineOutlinerThunk implies that the function is being created from
/// a sequence of instructions ending in a call. The outlined function is
/// called with a BL instruction, and the outlined function tail-calls the
/// original call destination.
///
/// That is,
///
/// I1                                OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// BL f                              I2
///                                   B f
///
/// * Call construction overhead: 1 (BL)
/// * Frame construction overhead: 0
/// * Requires stack fixups? No
///
/// \p MachineOutlinerRegSave implies that the function should be called with a
/// save and restore of LR to an available register. This allows us to avoid
/// stack fixups. Note that this outlining variant is compatible with the
/// NoLRSave case.
///
/// That is,
///
/// I1     Save LR                    OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3     Restore LR                 I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 3 (save + BL + restore)
/// * Frame construction overhead: 1 (ret)
/// * Requires stack fixups? No
enum MachineOutlinerClass {
  MachineOutlinerDefault,  /// Emit a save, restore, call, and return.
  MachineOutlinerTailCall, /// Only emit a branch.
  MachineOutlinerNoLRSave, /// Emit a call and return.
  MachineOutlinerThunk,    /// Emit a call and tail-call.
  MachineOutlinerRegSave   /// Same as default, but save to a register.
};

enum MachineOutlinerMBBFlags {
  LRUnavailableSomewhere = 0x2,
  HasCalls = 0x4,
  UnsafeRegsDead = 0x8
};

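/// Find a GPR64 register that LR can be saved to across an outlined call for
/// candidate \p C, or an invalid Register if none is available. LR itself,
/// x16, x17, and any reserved registers are never chosen; the chosen register
/// must be free both inside the candidate sequence and across it.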
Register
AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
  MachineFunction *MF = C.getMF();
  const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
  const AArch64RegisterInfo *ARI =
      static_cast<const AArch64RegisterInfo *>(&TRI);
  // Check if there is an available register across the sequence that we can
  // use.
  for (unsigned Reg : AArch64::GPR64RegClass) {
    if (!ARI->isReservedReg(*MF, Reg) &&
        Reg != AArch64::LR &&  // LR is not reserved, but don't use it.
        Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
        Reg != AArch64::X17 && // Ditto for X17.
        C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
        C.isAvailableInsideSeq(Reg, TRI))
      return Reg;
  }
  return Register();
}

static bool
outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
                                         const outliner::Candidate &b) {
  const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
  const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();

  return MFIa->shouldSignReturnAddress(false) ==
             MFIb->shouldSignReturnAddress(false) &&
         MFIa->shouldSignReturnAddress(true) ==
             MFIb->shouldSignReturnAddress(true);
}

static bool
outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
                                       const outliner::Candidate &b) {
  const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
  const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();

  return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
}

static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
                                                const outliner::Candidate &b) {
  const AArch64Subtarget &SubtargetA =
      a.getMF()->getSubtarget<AArch64Subtarget>();
  const AArch64Subtarget &SubtargetB =
      b.getMF()->getSubtarget<AArch64Subtarget>();
  return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
}

std::optional<outliner::OutlinedFunction>
AArch64InstrInfo::getOutliningCandidateInfo(
    std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
  outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
  unsigned SequenceSize =
      std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0,
                      [this](unsigned Sum, const MachineInstr &MI) {
                        return Sum + getInstSizeInBytes(MI);
                      });
  unsigned NumBytesToCreateFrame = 0;

  // We only allow outlining for functions having exactly matching return
  // address signing attributes, i.e., all share the same value for the
  // attribute "sign-return-address" and all share the same type of key they
  // are signed with.
  // Additionally we require all functions to simultaneously either support
  // v8.3a features or not. Otherwise an outlined function could get signed
  // using dedicated v8.3 instructions and a call from a function that doesn't
  // support v8.3 instructions would therefore be invalid.
  if (std::adjacent_find(
          RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
          [](const outliner::Candidate &a, const outliner::Candidate &b) {
            // Return true if a and b are non-equal w.r.t. return address
            // signing or support of v8.3a features.
            if (outliningCandidatesSigningScopeConsensus(a, b) &&
                outliningCandidatesSigningKeyConsensus(a, b) &&
                outliningCandidatesV8_3OpsConsensus(a, b)) {
              return false;
            }
            return true;
          }) != RepeatedSequenceLocs.end()) {
    return std::nullopt;
  }

  // Since at this point all candidates agree on their return address signing,
  // picking just one is fine. If the candidate functions potentially sign
  // their return addresses, the outlined function should do the same. Note
  // that in the case of "sign-return-address"="non-leaf" this is an
  // assumption: It is not certainly true that the outlined function will have
  // to sign its return address, but this decision is made later, when the
  // decision to outline has already been made.
  // The same holds for the number of additional instructions we need: On
  // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
  // necessary. However, at this point we don't know if the outlined function
  // will have a RET instruction, so we assume the worst.
  const TargetRegisterInfo &TRI = getRegisterInfo();
  if (FirstCand.getMF()
          ->getInfo<AArch64FunctionInfo>()
          ->shouldSignReturnAddress(true)) {
    // One PAC and one AUT instruction.
    NumBytesToCreateFrame += 8;

    // We have to check if sp modifying instructions would get outlined.
    // If so, we only allow outlining if sp is unchanged overall: matching
    // sub and add instructions are okay to outline, all other sp modifications
    // are not.
    auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
      int SPValue = 0;
      MachineBasicBlock::iterator MBBI = C.front();
      for (;;) {
        if (MBBI->modifiesRegister(AArch64::SP, &TRI)) {
          switch (MBBI->getOpcode()) {
          case AArch64::ADDXri:
          case AArch64::ADDWri:
            assert(MBBI->getNumOperands() == 4 && "Wrong number of operands");
            assert(MBBI->getOperand(2).isImm() &&
                   "Expected operand to be immediate");
            assert(MBBI->getOperand(1).isReg() &&
                   "Expected operand to be a register");
            // Check if the add just increments sp. If so, we search for
            // matching sub instructions that decrement sp. If not, the
            // modification is illegal.
            if (MBBI->getOperand(1).getReg() == AArch64::SP)
              SPValue += MBBI->getOperand(2).getImm();
            else
              return true;
            break;
          case AArch64::SUBXri:
          case AArch64::SUBWri:
            assert(MBBI->getNumOperands() == 4 && "Wrong number of operands");
            assert(MBBI->getOperand(2).isImm() &&
                   "Expected operand to be immediate");
            assert(MBBI->getOperand(1).isReg() &&
                   "Expected operand to be a register");
            // Check if the sub just decrements sp. If so, we search for
            // matching add instructions that increment sp. If not, the
            // modification is illegal.
            if (MBBI->getOperand(1).getReg() == AArch64::SP)
              SPValue -= MBBI->getOperand(2).getImm();
            else
              return true;
            break;
          default:
            return true;
          }
        }
        if (MBBI == C.back())
          break;
        ++MBBI;
      }
      if (SPValue)
        return true;
      return false;
    };
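    // As an illustrative sketch, a candidate containing the balanced pair
    //   sub sp, sp, #16  ...  add sp, sp, #16
    // nets out to SPValue == 0 and is kept, while an unmatched
    //   add sp, sp, #32
    // or any other kind of write to sp marks the candidate as illegal, so it
    // is erased below.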
    // Remove candidates with illegal stack modifying instructions.
    llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);

    // If the sequence doesn't have enough candidates left, then we're done.
    if (RepeatedSequenceLocs.size() < 2)
      return std::nullopt;
  }

  // Properties about candidate MBBs that hold for all of them.
  unsigned FlagsSetInAll = 0xF;

  // Compute liveness information for each candidate, and set FlagsSetInAll.
  for (outliner::Candidate &C : RepeatedSequenceLocs)
    FlagsSetInAll &= C.Flags;

  unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode();

  // Helper lambda which sets call information for every candidate.
  auto SetCandidateCallInfo =
      [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
        for (outliner::Candidate &C : RepeatedSequenceLocs)
          C.setCallInfo(CallID, NumBytesForCall);
      };

  unsigned FrameID = MachineOutlinerDefault;
  NumBytesToCreateFrame += 4;

  bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
    return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
  });

  // We check to see if CFI instructions are present, and if they are, we
  // count the CFI instructions in the candidates.
  unsigned CFICount = 0;
  for (auto &I : make_range(RepeatedSequenceLocs[0].front(),
                            std::next(RepeatedSequenceLocs[0].back()))) {
    if (I.isCFIInstruction())
      CFICount++;
  }

  // We compare the number of found CFI instructions to the number of CFI
  // instructions in the parent function for each candidate. We must check this
  // since if we outline one of the CFI instructions in a function, we have to
  // outline them all for correctness. If we do not, the address offsets will
  // be incorrect between the two sections of the program.
  for (outliner::Candidate &C : RepeatedSequenceLocs) {
    std::vector<MCCFIInstruction> CFIInstructions =
        C.getMF()->getFrameInstructions();

    if (CFICount > 0 && CFICount != CFIInstructions.size())
      return std::nullopt;
  }

  // Returns true if an instruction is safe to fix up, false otherwise.
  auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
    if (MI.isCall())
      return true;

    if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
        !MI.readsRegister(AArch64::SP, &TRI))
      return true;

    // Any modification of SP will break our code to save/restore LR.
    // FIXME: We could handle some instructions which add a constant
    // offset to SP, with a bit more work.
    if (MI.modifiesRegister(AArch64::SP, &TRI))
      return false;

    // At this point, we have a stack instruction that we might need to
    // fix up. We'll handle it if it's a load or store.
    if (MI.mayLoadOrStore()) {
      const MachineOperand *Base; // Filled with the base operand of MI.
      int64_t Offset;             // Filled with the offset of MI.
      bool OffsetIsScalable;

      // Does it allow us to offset the base operand and is the base the
      // register SP?
      if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
          !Base->isReg() || Base->getReg() != AArch64::SP)
        return false;

      // Fix-up code below assumes byte offsets.
      if (OffsetIsScalable)
        return false;

      // Find the minimum/maximum offset for this instruction and check
      // if fixing it up would be in range.
      int64_t MinOffset, MaxOffset; // Unscaled offsets for the instruction.
      TypeSize Scale(0U, false);    // The scale to multiply the offsets by.
      unsigned DummyWidth;
      getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);

      Offset += 16; // Update the offset to what it would be if we outlined.
      if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
          Offset > MaxOffset * (int64_t)Scale.getFixedValue())
        return false;

      // It's in range, so we can outline it.
      return true;
    }

    // FIXME: Add handling for instructions like "add x0, sp, #8".

    // We can't fix it up, so don't outline it.
    return false;
  };
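  // As an illustrative example, an outlined "ldr x0, [sp, #8]" has to become
  // "ldr x0, [sp, #24]" once the outlined frame spills LR, so it is only safe
  // to fix up if the adjusted offset still fits the instruction's scaled
  // immediate range.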

  // True if it's possible to fix up each stack instruction in this sequence.
  // Important for frames/call variants that modify the stack.
  bool AllStackInstrsSafe = std::all_of(
      FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup);

  // If the last instruction in any candidate is a terminator, then we should
  // tail call all of the candidates.
  if (RepeatedSequenceLocs[0].back()->isTerminator()) {
    FrameID = MachineOutlinerTailCall;
    NumBytesToCreateFrame = 0;
    SetCandidateCallInfo(MachineOutlinerTailCall, 4);
  }

  else if (LastInstrOpcode == AArch64::BL ||
           ((LastInstrOpcode == AArch64::BLR ||
             LastInstrOpcode == AArch64::BLRNoIP) &&
            !HasBTI)) {
    // FIXME: Do we need to check if the code after this uses the value of LR?
    FrameID = MachineOutlinerThunk;
    NumBytesToCreateFrame = 0;
    SetCandidateCallInfo(MachineOutlinerThunk, 4);
  }

  else {
    // We need to decide how to emit calls + frames. We can always emit the
    // same frame if we don't need to save to the stack. If we have to save to
    // the stack, then we need a different frame.
    unsigned NumBytesNoStackCalls = 0;
    std::vector<outliner::Candidate> CandidatesWithoutStackFixups;

    // Check if we have to save LR.
    for (outliner::Candidate &C : RepeatedSequenceLocs) {
      bool LRAvailable =
          (C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere)
              ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)
              : true;
      // If we have a noreturn caller, then we're going to be conservative and
      // say that we have to save LR. If we don't have a ret at the end of the
      // block, then we can't reason about liveness accurately.
      //
      // FIXME: We can probably do better than always disabling this in
      // noreturn functions by fixing up the liveness info.
      bool IsNoReturn =
          C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);

      // Is LR available? If so, we don't need a save.
      if (LRAvailable && !IsNoReturn) {
        NumBytesNoStackCalls += 4;
        C.setCallInfo(MachineOutlinerNoLRSave, 4);
        CandidatesWithoutStackFixups.push_back(C);
      }

      // Is an unused register available? If so, we won't modify the stack, so
      // we can outline with the same frame type as those that don't save LR.
      else if (findRegisterToSaveLRTo(C)) {
        NumBytesNoStackCalls += 12;
        C.setCallInfo(MachineOutlinerRegSave, 12);
        CandidatesWithoutStackFixups.push_back(C);
      }

      // Is SP used in the sequence at all? If not, we don't have to modify
      // the stack, so we are guaranteed to get the same frame.
      else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
        NumBytesNoStackCalls += 12;
        C.setCallInfo(MachineOutlinerDefault, 12);
        CandidatesWithoutStackFixups.push_back(C);
      }

      // If we outline this, we need to modify the stack. Pretend we don't
      // outline this by saving all of its bytes.
      else {
        NumBytesNoStackCalls += SequenceSize;
      }
    }

    // If there are no places where we have to save LR, then note that we
    // don't have to update the stack. Otherwise, give every candidate the
    // default call type, as long as it's safe to do so.
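    // For instance (illustrative numbers only): with three candidates where
    // two can use MachineOutlinerNoLRSave (4 bytes each) and one needs
    // MachineOutlinerRegSave (12 bytes), NumBytesNoStackCalls is 20, which is
    // no more than 3 * 12, so the cheaper per-candidate call types are kept.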
    if (!AllStackInstrsSafe ||
        NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
      RepeatedSequenceLocs = CandidatesWithoutStackFixups;
      FrameID = MachineOutlinerNoLRSave;
    } else {
      SetCandidateCallInfo(MachineOutlinerDefault, 12);

      // Bugzilla ID: 46767
      // TODO: Check if fixing up the stack more than once is safe so we can
      // outline these.
      //
      // An outline resulting in a caller that requires stack fixups at the
      // callsite to a callee that also requires stack fixups can happen when
      // there are no available registers at the candidate callsite for a
      // candidate that itself also has calls.
      //
      // In other words, if function_containing_sequence in the following
      // pseudo-assembly requires that we save LR at the point of the call,
      // but there are no available registers, we save using SP and, as a
      // result, the SP offsets require stack fixups by multiples of 16.
      //
      // function_containing_sequence:
      //   ...
      //   save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
      //   call OUTLINED_FUNCTION_N
      //   restore LR from SP
      //   ...
      //
      // OUTLINED_FUNCTION_N:
      //   save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
      //   ...
      //   bl foo
      //   restore LR from SP
      //   ret
      //
      // Because the code to handle more than one stack fixup does not
      // currently have the proper checks for legality, these cases will assert
      // in the AArch64 MachineOutliner. This is because the code to do this
      // needs more hardening, testing, better checks that generated code is
      // legal, etc., and because it is only verified to handle a single pass
      // of stack fixup.
      //
      // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
      // these cases until they are known to be handled. Bugzilla 46767 is
      // referenced in comments at the assert site.
      //
      // To avoid asserting (or generating non-legal code on no-assert builds)
      // we remove all candidates which would need more than one stack fixup by
      // pruning the cases where the candidate has calls while also having no
      // available LR and having no available general purpose registers to copy
      // LR to (i.e. one extra stack save/restore).
      //
      if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
        erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) {
          return (std::any_of(
                     C.front(), std::next(C.back()),
                     [](const MachineInstr &MI) { return MI.isCall(); })) &&
                 (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
                  !findRegisterToSaveLRTo(C));
        });
      }
    }

    // If we dropped all of the candidates, bail out here.
    if (RepeatedSequenceLocs.size() < 2) {
      RepeatedSequenceLocs.clear();
      return std::nullopt;
    }
  }

  // Does every candidate's MBB contain a call? If so, then we might have a
  // call in the range.
  if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
    // Check if the range contains a call. These require a save + restore of
    // the link register.
    bool ModStackToSaveLR = false;
    if (std::any_of(FirstCand.front(), FirstCand.back(),
                    [](const MachineInstr &MI) { return MI.isCall(); }))
      ModStackToSaveLR = true;

    // Handle the last instruction separately. If this is a tail call, then the
    // last instruction is a call. We don't want to save + restore in this
    // case.
    // However, it could be possible that the last instruction is a call
    // without it being valid to tail call this sequence. We should consider
    // this as well.
    else if (FrameID != MachineOutlinerThunk &&
             FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall())
      ModStackToSaveLR = true;

    if (ModStackToSaveLR) {
      // We can't fix up the stack. Bail out.
      if (!AllStackInstrsSafe) {
        RepeatedSequenceLocs.clear();
        return std::nullopt;
      }

      // Save + restore LR.
      NumBytesToCreateFrame += 8;
    }
  }

  // If we have CFI instructions, we can only outline if the outlined section
  // can be a tail call.
  if (FrameID != MachineOutlinerTailCall && CFICount > 0)
    return std::nullopt;

  return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
                                    NumBytesToCreateFrame, FrameID);
}

bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
    MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
  const Function &F = MF.getFunction();

  // Can F be deduplicated by the linker? If it can, don't outline from it.
  if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
    return false;

  // Don't outline from functions with section markings; the program could
  // expect that all the code is in the named section.
  // FIXME: Allow outlining from multiple functions with the same section
  // marking.
  if (F.hasSection())
    return false;

  // Outlining from functions with redzones is unsafe since the outliner may
  // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
  // outline from it.
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  if (!AFI || AFI->hasRedZone().value_or(true))
    return false;

  // FIXME: Teach the outliner to generate/handle Windows unwind info.
  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
    return false;

  // It's safe to outline from MF.
  return true;
}

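/// Compute the ranges of \p MBB from which it is safe to outline, and update
/// \p Flags with properties of the whole block (for example, whether it
/// contains calls). Only ranges in which the registers the outliner may
/// clobber (x16, x17 and NZCV) are dead are returned; liveness is computed
/// bottom-up over the block.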
SmallVector<
    std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
                                      unsigned &Flags) const {
  assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
         "Must track liveness!");
  SmallVector<
      std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
      Ranges;
  // According to the AArch64 Procedure Call Standard, the following are
  // undefined on entry/exit from a function call:
  //
  // * Registers x16, x17 (and thus w16, w17)
  // * Condition codes (and thus the NZCV register)
  //
  // If any of these registers are used inside or live across an outlined
  // function, then they may be modified later, either by the compiler or
  // some other tool (like the linker).
  //
  // To avoid outlining in these situations, partition each block into ranges
  // where these registers are dead. We will only outline from those ranges.
  LiveRegUnits LRU(getRegisterInfo());
  auto AreAllUnsafeRegsDead = [&LRU]() {
    return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&
           LRU.available(AArch64::NZCV);
  };

  // We need to know if LR is live across an outlining boundary later on in
  // order to decide how we'll create the outlined call, frame, etc.
  //
  // It's pretty expensive to check this for *every candidate* within a block.
  // That's some potentially n^2 behaviour, since in the worst case, we'd need
  // to compute liveness from the end of the block for O(n) candidates within
  // the block.
  //
  // So, to improve the average case, let's keep track of liveness from the end
  // of the block to the beginning of *every outlinable range*. If we know that
  // LR is available in every range we could outline from, then we know that
  // we don't need to check liveness for any candidate within that range.
  bool LRAvailableEverywhere = true;
  // Compute liveness bottom-up.
  LRU.addLiveOuts(MBB);
  // Update flags that require info about the entire MBB.
  auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
    if (MI.isCall() && !MI.isTerminator())
      Flags |= MachineOutlinerMBBFlags::HasCalls;
  };
  // Range: [RangeBegin, RangeEnd)
  MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
  unsigned RangeLen;
  auto CreateNewRangeStartingAt =
      [&RangeBegin, &RangeEnd,
       &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
        RangeBegin = NewBegin;
        RangeEnd = std::next(RangeBegin);
        RangeLen = 0;
      };
  auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
    // At least one unsafe register is not dead. We do not want to outline at
    // this point. If it is long enough to outline from, save the range
    // [RangeBegin, RangeEnd).
    if (RangeLen > 1)
      Ranges.push_back(std::make_pair(RangeBegin, RangeEnd));
  };
  // Find the first point where all unsafe registers are dead.
  // FIND: <safe instr> <-- end of first potential range
  // SKIP: <unsafe def>
  // SKIP: ... everything between ...
  // SKIP: <unsafe use>
  auto FirstPossibleEndPt = MBB.instr_rbegin();
  for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
    LRU.stepBackward(*FirstPossibleEndPt);
    // Update flags that impact how we outline across the entire block,
    // regardless of safety.
    UpdateWholeMBBFlags(*FirstPossibleEndPt);
    if (AreAllUnsafeRegsDead())
      break;
  }
  // If we exhausted the entire block, we have no safe ranges to outline.
  if (FirstPossibleEndPt == MBB.instr_rend())
    return Ranges;
  // Current range.
  CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
  // StartPt points to the first place where all unsafe registers
  // are dead (if there is any such point). Begin partitioning the MBB into
  // ranges.
  for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) {
    LRU.stepBackward(MI);
    UpdateWholeMBBFlags(MI);
    if (!AreAllUnsafeRegsDead()) {
      SaveRangeIfNonEmpty();
      CreateNewRangeStartingAt(MI.getIterator());
      continue;
    }
    LRAvailableEverywhere &= LRU.available(AArch64::LR);
    RangeBegin = MI.getIterator();
    ++RangeLen;
  }
  // The above loop misses the last (or only) range. If we are still safe,
  // save that range as well.
  if (AreAllUnsafeRegsDead())
    SaveRangeIfNonEmpty();
  if (Ranges.empty())
    return Ranges;
  // We found the ranges bottom-up, but the mapping expects them top-down, so
  // reverse the order.
  std::reverse(Ranges.begin(), Ranges.end());
  // If there is at least one outlinable range where LR is unavailable
  // somewhere, remember that.
  if (!LRAvailableEverywhere)
    Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
  return Ranges;
}

outliner::InstrType
AArch64InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT,
                                       unsigned Flags) const {
  MachineInstr &MI = *MIT;
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();

  // Don't outline anything used for return address signing. The outlined
  // function will get signed later if needed.
  switch (MI.getOpcode()) {
  case AArch64::PACIASP:
  case AArch64::PACIBSP:
  case AArch64::AUTIASP:
  case AArch64::AUTIBSP:
  case AArch64::RETAA:
  case AArch64::RETAB:
  case AArch64::EMITBKEY:
    return outliner::InstrType::Illegal;
  }

  // Don't outline LOHs.
  if (FuncInfo->getLOHRelated().count(&MI))
    return outliner::InstrType::Illegal;

  // We can only outline these if we will tail call the outlined function, or
  // fix up the CFI offsets. Currently, CFI instructions are outlined only if
  // in a tail call.
  //
  // FIXME: If the proper fixups for the offset are implemented, this should be
  // possible.
  if (MI.isCFIInstruction())
    return outliner::InstrType::Legal;

  // Is this a terminator for a basic block?
  if (MI.isTerminator())
    // TargetInstrInfo::getOutliningType has already filtered out anything
    // that would break this, so we can allow it here.
    return outliner::InstrType::Legal;

  // Make sure none of the operands are un-outlinable.
  for (const MachineOperand &MOP : MI.operands()) {
    // A check preventing CFI indices was here before, but only CFI
    // instructions should have those.
    assert(!MOP.isCFIIndex());

    // If it uses LR or W30 explicitly, then don't touch it.
    if (MOP.isReg() && !MOP.isImplicit() &&
        (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
      return outliner::InstrType::Illegal;
  }

  // Special cases for instructions that can always be outlined, but will fail
  // the later tests. e.g. ADRPs, which are PC-relative and use LR, but can
  // always be outlined because they don't require a *specific* value to be in
  // LR.
  if (MI.getOpcode() == AArch64::ADRP)
    return outliner::InstrType::Legal;

  // If MI is a call we might be able to outline it. We don't want to outline
  // any calls that rely on the position of items on the stack. When we outline
  // something containing a call, we have to emit a save and restore of LR in
  // the outlined function. Currently, this always happens by saving LR to the
  // stack. Thus, if we outline, say, half the parameters for a function call
  // plus the call, then we'll break the callee's expectations for the layout
  // of the stack.
  //
  // FIXME: Allow calls to functions which construct a stack frame, as long
  // as they don't access arguments on the stack.
  // FIXME: Figure out some way to analyze functions defined in other modules.
  // We should be able to compute the memory usage based on the IR calling
  // convention, even if we can't see the definition.
  if (MI.isCall()) {
    // Get the function associated with the call. Look at each operand and find
    // the one that represents the callee and get its name.
    const Function *Callee = nullptr;
    for (const MachineOperand &MOP : MI.operands()) {
      if (MOP.isGlobal()) {
        Callee = dyn_cast<Function>(MOP.getGlobal());
        break;
      }
    }

    // Never outline calls to mcount. There isn't any rule that would require
    // this, but the Linux kernel's "ftrace" feature depends on it.
    if (Callee && Callee->getName() == "\01_mcount")
      return outliner::InstrType::Illegal;

    // If we don't know anything about the callee, assume it depends on the
    // stack layout of the caller. In that case, it's only legal to outline
    // as a tail-call. Explicitly list the call instructions we know about so
    // we don't get unexpected results with call pseudo-instructions.
    auto UnknownCallOutlineType = outliner::InstrType::Illegal;
    if (MI.getOpcode() == AArch64::BLR ||
        MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
      UnknownCallOutlineType = outliner::InstrType::LegalTerminator;

    if (!Callee)
      return UnknownCallOutlineType;

    // We have a function we have information about. Check if it's something
    // we can safely outline.
    MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);

    // We don't know what's going on with the callee at all. Don't touch it.
    if (!CalleeMF)
      return UnknownCallOutlineType;

    // Check if we know anything about the callee saves on the function. If we
    // don't, then don't touch it, since that implies that we haven't
    // computed anything about its stack frame yet.
    MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
    if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
        MFI.getNumObjects() > 0)
      return UnknownCallOutlineType;

    // At this point, we can say that CalleeMF ought not to pass anything on
    // the stack. Therefore, we can outline it.
    return outliner::InstrType::Legal;
  }

  // Don't touch the link register or W30.
  if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
      MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
    return outliner::InstrType::Illegal;

  // Don't outline BTI instructions, because that will prevent the outlining
  // site from being indirectly callable.
  if (MI.getOpcode() == AArch64::HINT) {
    int64_t Imm = MI.getOperand(0).getImm();
    if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
      return outliner::InstrType::Illegal;
  }

  return outliner::InstrType::Legal;
}

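/// Rewrite the immediates of SP-relative loads and stores in \p MBB after the
/// outlined frame has pushed LR, which moves SP down by 16 bytes. For example
/// (illustrative only), an outlined "ldr x0, [sp, #8]" becomes
/// "ldr x0, [sp, #24]" so that it still addresses the caller's original slot.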
void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
  for (MachineInstr &MI : MBB) {
    const MachineOperand *Base;
    unsigned Width;
    int64_t Offset;
    bool OffsetIsScalable;

    // Is this a load or store with an immediate offset with SP as the base?
    if (!MI.mayLoadOrStore() ||
        !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
                                      &RI) ||
        (Base->isReg() && Base->getReg() != AArch64::SP))
      continue;

    // It is, so we have to fix it up.
    TypeSize Scale(0U, false);
    int64_t Dummy1, Dummy2;

    MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
    assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
    getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
    assert(Scale != 0 && "Unexpected opcode!");
    assert(!OffsetIsScalable && "Expected offset to be a byte offset");

    // We've pushed the return address to the stack, so add 16 to the offset.
    // This is safe, since we already checked if it would overflow when we
    // checked if this instruction was legal to outline.
    int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
    StackOffsetOperand.setImm(NewImm);
  }
}

static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
                                 bool ShouldSignReturnAddr,
                                 bool ShouldSignReturnAddrWithBKey) {
  if (ShouldSignReturnAddr) {
    MachineBasicBlock::iterator MBBPAC = MBB.begin();
    MachineBasicBlock::iterator MBBAUT = MBB.getFirstTerminator();
    const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
    const TargetInstrInfo *TII = Subtarget.getInstrInfo();
    DebugLoc DL;

    if (MBBAUT != MBB.end())
      DL = MBBAUT->getDebugLoc();

    // At the very beginning of the basic block we insert the following
    // depending on the key type
    //
    // a_key:                   b_key:
    //    PACIASP                   EMITBKEY
    //    CFI_INSTRUCTION           PACIBSP
    //                              CFI_INSTRUCTION
    if (ShouldSignReturnAddrWithBKey) {
      BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::EMITBKEY))
          .setMIFlag(MachineInstr::FrameSetup);
    }

    BuildMI(MBB, MBBPAC, DebugLoc(),
            TII->get(ShouldSignReturnAddrWithBKey ? AArch64::PACIBSP
                                                  : AArch64::PACIASP))
        .setMIFlag(MachineInstr::FrameSetup);

    if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
      unsigned CFIIndex =
          MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
      BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION))
          .addCFIIndex(CFIIndex)
          .setMIFlags(MachineInstr::FrameSetup);
    }

    // If v8.3a features are available we can replace a RET instruction by
    // RETAA or RETAB and omit the AUT instructions. In this case the
    // DW_CFA_AARCH64_negate_ra_state can't be emitted.
    if (Subtarget.hasPAuth() && MBBAUT != MBB.end() &&
        MBBAUT->getOpcode() == AArch64::RET) {
      BuildMI(MBB, MBBAUT, DL,
              TII->get(ShouldSignReturnAddrWithBKey ? AArch64::RETAB
                                                    : AArch64::RETAA))
          .copyImplicitOps(*MBBAUT);
      MBB.erase(MBBAUT);
    } else {
      BuildMI(MBB, MBBAUT, DL,
              TII->get(ShouldSignReturnAddrWithBKey ? AArch64::AUTIBSP
                                                    : AArch64::AUTIASP))
                                                    : AArch64::AUTIASP))
          .setMIFlag(MachineInstr::FrameDestroy);
      unsigned CFIIndexAuth =
          MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
      BuildMI(MBB, MBBAUT, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
          .addCFIIndex(CFIIndexAuth)
          .setMIFlags(MachineInstr::FrameDestroy);
    }
  }
}

void AArch64InstrInfo::buildOutlinedFrame(
    MachineBasicBlock &MBB, MachineFunction &MF,
    const outliner::OutlinedFunction &OF) const {

  AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();

  if (OF.FrameConstructionID == MachineOutlinerTailCall)
    FI->setOutliningStyle("Tail Call");
  else if (OF.FrameConstructionID == MachineOutlinerThunk) {
    // For thunk outlining, rewrite the last instruction from a call to a
    // tail-call.
    MachineInstr *Call = &*--MBB.instr_end();
    unsigned TailOpcode;
    if (Call->getOpcode() == AArch64::BL) {
      TailOpcode = AArch64::TCRETURNdi;
    } else {
      assert(Call->getOpcode() == AArch64::BLR ||
             Call->getOpcode() == AArch64::BLRNoIP);
      TailOpcode = AArch64::TCRETURNriALL;
    }
    MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
                           .add(Call->getOperand(0))
                           .addImm(0);
    MBB.insert(MBB.end(), TC);
    Call->eraseFromParent();

    FI->setOutliningStyle("Thunk");
  }

  bool IsLeafFunction = true;

  // Is there a call in the outlined range?
  auto IsNonTailCall = [](const MachineInstr &MI) {
    return MI.isCall() && !MI.isReturn();
  };

  if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
    // Fix up the instructions in the range, since we're going to modify the
    // stack.

    // Bugzilla ID: 46767
    // TODO: Check if fixing up twice is safe so we can outline these.
    assert(OF.FrameConstructionID != MachineOutlinerDefault &&
           "Can only fix up stack references once");
    fixupPostOutline(MBB);

    IsLeafFunction = false;

    // LR has to be a live in so that we can save it.
    if (!MBB.isLiveIn(AArch64::LR))
      MBB.addLiveIn(AArch64::LR);

    MachineBasicBlock::iterator It = MBB.begin();
    MachineBasicBlock::iterator Et = MBB.end();

    if (OF.FrameConstructionID == MachineOutlinerTailCall ||
        OF.FrameConstructionID == MachineOutlinerThunk)
      Et = std::prev(MBB.end());

    // Insert a save before the outlined region.
    MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
                                .addReg(AArch64::SP, RegState::Define)
                                .addReg(AArch64::LR)
                                .addReg(AArch64::SP)
                                .addImm(-16);
    It = MBB.insert(It, STRXpre);

    if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
      const TargetSubtargetInfo &STI = MF.getSubtarget();
      const MCRegisterInfo *MRI = STI.getRegisterInfo();
      unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);

      // Add a CFI saying the stack was moved 16 B down.
      int64_t StackPosEntry =
          MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));
      BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
          .addCFIIndex(StackPosEntry)
          .setMIFlags(MachineInstr::FrameSetup);

      // Add a CFI saying that the LR that we want to find is now 16 B higher
      // than before.
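      // (In DWARF terms this is an offset rule for LR: the caller's return
      // address can now be found at CFA-16.)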
      int64_t LRPosEntry = MF.addFrameInst(
          MCCFIInstruction::createOffset(nullptr, DwarfReg, -16));
      BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
          .addCFIIndex(LRPosEntry)
          .setMIFlags(MachineInstr::FrameSetup);
    }

    // Insert a restore before the terminator for the function.
    MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
                                 .addReg(AArch64::SP, RegState::Define)
                                 .addReg(AArch64::LR, RegState::Define)
                                 .addReg(AArch64::SP)
                                 .addImm(16);
    Et = MBB.insert(Et, LDRXpost);
  }

  // If a bunch of candidates reach this point they must agree on their return
  // address signing. It is therefore enough to just consider the signing
  // behaviour of one of them.
  const auto &MFI = *OF.Candidates.front().getMF()->getInfo<AArch64FunctionInfo>();
  bool ShouldSignReturnAddr = MFI.shouldSignReturnAddress(!IsLeafFunction);

  // a_key is the default.
  bool ShouldSignReturnAddrWithBKey = MFI.shouldSignWithBKey();

  // If this is a tail call outlined function, then there's already a return.
  if (OF.FrameConstructionID == MachineOutlinerTailCall ||
      OF.FrameConstructionID == MachineOutlinerThunk) {
    signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
                         ShouldSignReturnAddrWithBKey);
    return;
  }

  // It's not a tail call, so we have to insert the return ourselves.

  // LR has to be a live in so that we can return to it.
  if (!MBB.isLiveIn(AArch64::LR))
    MBB.addLiveIn(AArch64::LR);

  MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
                          .addReg(AArch64::LR);
  MBB.insert(MBB.end(), ret);

  signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
                       ShouldSignReturnAddrWithBKey);

  FI->setOutliningStyle("Function");

  // Did we have to modify the stack by saving the link register?
  if (OF.FrameConstructionID != MachineOutlinerDefault)
    return;

  // We modified the stack.
  // Walk over the basic block and fix up all the stack accesses.
  fixupPostOutline(MBB);
}

MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
    Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
    MachineFunction &MF, outliner::Candidate &C) const {

  // Are we tail calling?
  if (C.CallConstructionID == MachineOutlinerTailCall) {
    // If yes, then we can just branch to the label.
    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
                            .addGlobalAddress(M.getNamedValue(MF.getName()))
                            .addImm(0));
    return It;
  }

  // Are we saving the link register?
  if (C.CallConstructionID == MachineOutlinerNoLRSave ||
      C.CallConstructionID == MachineOutlinerThunk) {
    // No, so just insert the call.
    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
                            .addGlobalAddress(M.getNamedValue(MF.getName())));
    return It;
  }

  // We want to return the spot where we inserted the call.
  MachineBasicBlock::iterator CallPt;

  // Instructions for saving and restoring LR around the call instruction we're
  // going to insert.
  MachineInstr *Save;
  MachineInstr *Restore;
  // Can we save to a register?
  if (C.CallConstructionID == MachineOutlinerRegSave) {
    // FIXME: This logic should be sunk into a target-specific interface so
    // that we don't have to recompute the register.
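    // Rather than spilling LR to the stack, keep it in the register chosen by
    // findRegisterToSaveLRTo for the duration of the call: Save below is
    // effectively `mov Reg, lr` and Restore is `mov lr, Reg` (ORRXrs with XZR
    // is the canonical MOV alias).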
    Register Reg = findRegisterToSaveLRTo(C);
    assert(Reg && "No callee-saved register available?");

    // LR has to be a live in so that we can save it.
    if (!MBB.isLiveIn(AArch64::LR))
      MBB.addLiveIn(AArch64::LR);

    // Save and restore LR from Reg.
    Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
               .addReg(AArch64::XZR)
               .addReg(AArch64::LR)
               .addImm(0);
    Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
                  .addReg(AArch64::XZR)
                  .addReg(Reg)
                  .addImm(0);
  } else {
    // We have the default case. Save and restore from SP.
    Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
               .addReg(AArch64::SP, RegState::Define)
               .addReg(AArch64::LR)
               .addReg(AArch64::SP)
               .addImm(-16);
    Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
                  .addReg(AArch64::SP, RegState::Define)
                  .addReg(AArch64::LR, RegState::Define)
                  .addReg(AArch64::SP)
                  .addImm(16);
  }

  It = MBB.insert(It, Save);
  It++;

  // Insert the call.
  It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
                          .addGlobalAddress(M.getNamedValue(MF.getName())));
  CallPt = It;
  It++;

  It = MBB.insert(It, Restore);
  return CallPt;
}

bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
    MachineFunction &MF) const {
  return MF.getFunction().hasMinSize();
}

std::optional<DestSourcePair>
AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {

  // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg
  // and zero immediate operands are used as an alias for the mov instruction.
  if (MI.getOpcode() == AArch64::ORRWrs &&
      MI.getOperand(1).getReg() == AArch64::WZR &&
      MI.getOperand(3).getImm() == 0x0) {
    return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
  }

  if (MI.getOpcode() == AArch64::ORRXrs &&
      MI.getOperand(1).getReg() == AArch64::XZR &&
      MI.getOperand(3).getImm() == 0x0) {
    return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
  }

  return std::nullopt;
}

std::optional<RegImmPair>
AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
  int Sign = 1;
  int64_t Offset = 0;

  // TODO: Handle cases where Reg is a super- or sub-register of the
  // destination register.
  const MachineOperand &Op0 = MI.getOperand(0);
  if (!Op0.isReg() || Reg != Op0.getReg())
    return std::nullopt;

  switch (MI.getOpcode()) {
  default:
    return std::nullopt;
  case AArch64::SUBWri:
  case AArch64::SUBXri:
  case AArch64::SUBSWri:
  case AArch64::SUBSXri:
    Sign *= -1;
    [[fallthrough]];
  case AArch64::ADDSWri:
  case AArch64::ADDSXri:
  case AArch64::ADDWri:
  case AArch64::ADDXri: {
    // TODO: Third operand can be a global address (usually some string).
    if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
        !MI.getOperand(2).isImm())
      return std::nullopt;
    int Shift = MI.getOperand(3).getImm();
    assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
    Offset = Sign * (MI.getOperand(2).getImm() << Shift);
  }
  }
  return RegImmPair{MI.getOperand(1).getReg(), Offset};
}

/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
/// the destination register then, if possible, describe the value in terms of
/// the source register.
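/// For example, for a copy `$w0 = ORRWrs $wzr, $w1, 0`, a query for $w0 (or
/// for $x0, since ORRWrs implicitly zeroes the upper 32 bits of $x0) can be
/// answered with the value of $w1.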
static std::optional<ParamLoadedValue>
describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
                       const TargetInstrInfo *TII,
                       const TargetRegisterInfo *TRI) {
  auto DestSrc = TII->isCopyInstr(MI);
  if (!DestSrc)
    return std::nullopt;

  Register DestReg = DestSrc->Destination->getReg();
  Register SrcReg = DestSrc->Source->getReg();

  auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});

  // If the described register is the destination, just return the source.
  if (DestReg == DescribedReg)
    return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);

  // ORRWrs zero-extends to 64 bits, so we need to consider such cases.
  if (MI.getOpcode() == AArch64::ORRWrs &&
      TRI->isSuperRegister(DestReg, DescribedReg))
    return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);

  // We may need to describe the lower part of an ORRXrs move.
  if (MI.getOpcode() == AArch64::ORRXrs &&
      TRI->isSubRegister(DestReg, DescribedReg)) {
    Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
    return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
  }

  assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
         "Unhandled ORR[XW]rs copy case");

  return std::nullopt;
}

std::optional<ParamLoadedValue>
AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
                                      Register Reg) const {
  const MachineFunction *MF = MI.getMF();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  switch (MI.getOpcode()) {
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: {
    // MOVZWi may be used for producing zero-extended 32-bit immediates in
    // 64-bit parameters, so we need to consider super-registers.
    if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
      return std::nullopt;

    if (!MI.getOperand(1).isImm())
      return std::nullopt;
    int64_t Immediate = MI.getOperand(1).getImm();
    int Shift = MI.getOperand(2).getImm();
    return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
                            nullptr);
  }
  case AArch64::ORRWrs:
  case AArch64::ORRXrs:
    return describeORRLoadedValue(MI, Reg, this, TRI);
  }

  return TargetInstrInfo::describeLoadedValue(MI, Reg);
}

bool AArch64InstrInfo::isExtendLikelyToBeFolded(
    MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
  assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
         ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
         ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);

  // Anyexts are nops.
  if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
    return true;

  Register DefReg = ExtMI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(DefReg))
    return false;

  // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
  // addressing mode.
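  // For instance, a G_SEXT of a 32-bit index feeding a G_PTR_ADD can typically
  // be selected as a load/store with a sign-extended (SXTW) register offset,
  // making the extend effectively free.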
  auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
  return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
}

uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::ElementSizeMask;
}

bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
}

bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
}

unsigned int
AArch64InstrInfo::getTailDuplicateSize(CodeGenOpt::Level OptLevel) const {
  return OptLevel >= CodeGenOpt::Aggressive ? 6 : 2;
}

unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
  if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
    return AArch64::BLRNoIP;
  else
    return AArch64::BLR;
}

#define GET_INSTRINFO_HELPERS
#define GET_INSTRMAP_INFO
#include "AArch64GenInstrInfo.inc"