1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains the AArch64 implementation of the TargetInstrInfo class. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "AArch64InstrInfo.h" 14 #include "AArch64MachineFunctionInfo.h" 15 #include "AArch64Subtarget.h" 16 #include "MCTargetDesc/AArch64AddressingModes.h" 17 #include "Utils/AArch64BaseInfo.h" 18 #include "llvm/ADT/ArrayRef.h" 19 #include "llvm/ADT/STLExtras.h" 20 #include "llvm/ADT/SmallVector.h" 21 #include "llvm/CodeGen/MachineBasicBlock.h" 22 #include "llvm/CodeGen/MachineCombinerPattern.h" 23 #include "llvm/CodeGen/MachineFrameInfo.h" 24 #include "llvm/CodeGen/MachineFunction.h" 25 #include "llvm/CodeGen/MachineInstr.h" 26 #include "llvm/CodeGen/MachineInstrBuilder.h" 27 #include "llvm/CodeGen/MachineMemOperand.h" 28 #include "llvm/CodeGen/MachineModuleInfo.h" 29 #include "llvm/CodeGen/MachineOperand.h" 30 #include "llvm/CodeGen/MachineRegisterInfo.h" 31 #include "llvm/CodeGen/StackMaps.h" 32 #include "llvm/CodeGen/TargetRegisterInfo.h" 33 #include "llvm/CodeGen/TargetSubtargetInfo.h" 34 #include "llvm/IR/DebugInfoMetadata.h" 35 #include "llvm/IR/DebugLoc.h" 36 #include "llvm/IR/GlobalValue.h" 37 #include "llvm/MC/MCAsmInfo.h" 38 #include "llvm/MC/MCInst.h" 39 #include "llvm/MC/MCInstBuilder.h" 40 #include "llvm/MC/MCInstrDesc.h" 41 #include "llvm/Support/Casting.h" 42 #include "llvm/Support/CodeGen.h" 43 #include "llvm/Support/CommandLine.h" 44 #include "llvm/Support/Compiler.h" 45 #include "llvm/Support/ErrorHandling.h" 46 #include "llvm/Support/LEB128.h" 47 #include "llvm/Support/MathExtras.h" 48 #include "llvm/Target/TargetMachine.h" 49 #include "llvm/Target/TargetOptions.h" 50 #include <cassert> 51 #include <cstdint> 52 #include <iterator> 53 #include <utility> 54 55 using namespace llvm; 56 57 #define GET_INSTRINFO_CTOR_DTOR 58 #include "AArch64GenInstrInfo.inc" 59 60 static cl::opt<unsigned> TBZDisplacementBits( 61 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), 62 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); 63 64 static cl::opt<unsigned> CBZDisplacementBits( 65 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), 66 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)")); 67 68 static cl::opt<unsigned> 69 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), 70 cl::desc("Restrict range of Bcc instructions (DEBUG)")); 71 72 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) 73 : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP, 74 AArch64::CATCHRET), 75 RI(STI.getTargetTriple()), Subtarget(STI) {} 76 77 /// GetInstSize - Return the number of bytes of code the specified 78 /// instruction may be. This returns the maximum number of bytes. 
79 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { 80 const MachineBasicBlock &MBB = *MI.getParent(); 81 const MachineFunction *MF = MBB.getParent(); 82 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); 83 84 { 85 auto Op = MI.getOpcode(); 86 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR) 87 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); 88 } 89 90 // Meta-instructions emit no code. 91 if (MI.isMetaInstruction()) 92 return 0; 93 94 // FIXME: We currently only handle pseudoinstructions that don't get expanded 95 // before the assembly printer. 96 unsigned NumBytes = 0; 97 const MCInstrDesc &Desc = MI.getDesc(); 98 99 // Size should be preferably set in 100 // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case). 101 // Specific cases handle instructions of variable sizes 102 switch (Desc.getOpcode()) { 103 default: 104 if (Desc.getSize()) 105 return Desc.getSize(); 106 107 // Anything not explicitly designated otherwise (i.e. pseudo-instructions 108 // with fixed constant size but not specified in .td file) is a normal 109 // 4-byte insn. 110 NumBytes = 4; 111 break; 112 case TargetOpcode::STACKMAP: 113 // The upper bound for a stackmap intrinsic is the full length of its shadow 114 NumBytes = StackMapOpers(&MI).getNumPatchBytes(); 115 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 116 break; 117 case TargetOpcode::PATCHPOINT: 118 // The size of the patchpoint intrinsic is the number of bytes requested 119 NumBytes = PatchPointOpers(&MI).getNumPatchBytes(); 120 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 121 break; 122 case TargetOpcode::STATEPOINT: 123 NumBytes = StatepointOpers(&MI).getNumPatchBytes(); 124 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 125 // No patch bytes means a normal call inst is emitted 126 if (NumBytes == 0) 127 NumBytes = 4; 128 break; 129 case TargetOpcode::PATCHABLE_FUNCTION_ENTER: 130 case TargetOpcode::PATCHABLE_FUNCTION_EXIT: 131 // An XRay sled can be 4 bytes of alignment plus a 32-byte block. 132 NumBytes = 36; 133 break; 134 135 case AArch64::SPACE: 136 NumBytes = MI.getOperand(1).getImm(); 137 break; 138 case TargetOpcode::BUNDLE: 139 NumBytes = getInstBundleLength(MI); 140 break; 141 } 142 143 return NumBytes; 144 } 145 146 unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const { 147 unsigned Size = 0; 148 MachineBasicBlock::const_instr_iterator I = MI.getIterator(); 149 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); 150 while (++I != E && I->isInsideBundle()) { 151 assert(!I->isBundle() && "No nested bundle!"); 152 Size += getInstSizeInBytes(*I); 153 } 154 return Size; 155 } 156 157 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, 158 SmallVectorImpl<MachineOperand> &Cond) { 159 // Block ends with fall-through condbranch. 
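  // For reference, the Cond encodings produced by the cases below are:
  //   Bcc:          Cond = { <cc> }
  //   CB(N)Z[WX]:   Cond = { -1, <branch opcode>, <reg> }
  //   TB(N)Z[WX]:   Cond = { -1, <branch opcode>, <reg>, <bit> }
  // The leading -1 distinguishes the folded compare-and-branch forms from a
  // plain Bcc condition code.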
  switch (LastInst->getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    Target = LastInst->getOperand(2).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    Cond.push_back(LastInst->getOperand(1));
  }
}

static unsigned getBranchDisplacementBits(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return 64;
  case AArch64::TBNZW:
  case AArch64::TBZW:
  case AArch64::TBNZX:
  case AArch64::TBZX:
    return TBZDisplacementBits;
  case AArch64::CBNZW:
  case AArch64::CBZW:
  case AArch64::CBNZX:
  case AArch64::CBZX:
    return CBZDisplacementBits;
  case AArch64::Bcc:
    return BCCDisplacementBits;
  }
}

bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                             int64_t BrOffset) const {
  unsigned Bits = getBranchDisplacementBits(BranchOp);
  assert(Bits >= 3 && "max branch displacement must be enough to jump "
                      "over conditional branch expansion");
  return isIntN(Bits, BrOffset / 4);
}

MachineBasicBlock *
AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return MI.getOperand(0).getMBB();
  case AArch64::TBZW:
  case AArch64::TBNZW:
  case AArch64::TBZX:
  case AArch64::TBNZX:
    return MI.getOperand(2).getMBB();
  case AArch64::CBZW:
  case AArch64::CBNZW:
  case AArch64::CBZX:
  case AArch64::CBNZX:
  case AArch64::Bcc:
    return MI.getOperand(1).getMBB();
  }
}

// Branch analysis.
bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *&TBB,
                                     MachineBasicBlock *&FBB,
                                     SmallVectorImpl<MachineOperand> &Cond,
                                     bool AllowModify) const {
  // If the block has no terminators, it just falls into the block after it.
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return false;

  // Skip over SpeculationBarrierEndBB terminators
  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
    --I;
  }

  if (!isUnpredicatedTerminator(*I))
    return false;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;

  // If there is only one terminator instruction, process it.
  unsigned LastOpc = LastInst->getOpcode();
  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
    if (isUncondBranchOpcode(LastOpc)) {
      TBB = LastInst->getOperand(0).getMBB();
      return false;
    }
    if (isCondBranchOpcode(LastOpc)) {
      // Block ends with fall-through condbranch.
      parseCondBranch(LastInst, TBB, Cond);
      return false;
    }
    return true; // Can't handle indirect branch.
  }

  // Get the instruction before it if it is a terminator.
  MachineInstr *SecondLastInst = &*I;
  unsigned SecondLastOpc = SecondLastInst->getOpcode();

  // If AllowModify is true and the block ends with two or more unconditional
  // branches, delete all but the first unconditional branch.
  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
    while (isUncondBranchOpcode(SecondLastOpc)) {
      LastInst->eraseFromParent();
      LastInst = SecondLastInst;
      LastOpc = LastInst->getOpcode();
      if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
        // Return now; the only terminator is an unconditional branch.
        TBB = LastInst->getOperand(0).getMBB();
        return false;
      } else {
        SecondLastInst = &*I;
        SecondLastOpc = SecondLastInst->getOpcode();
      }
    }
  }

  // If we're allowed to modify and the block ends in an unconditional branch
  // which could simply fallthrough, remove the branch. (Note: This case only
  // matters when we can't understand the whole sequence, otherwise it's also
  // handled by BranchFolding.cpp.)
  if (AllowModify && isUncondBranchOpcode(LastOpc) &&
      MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
    LastInst->eraseFromParent();
    LastInst = SecondLastInst;
    LastOpc = LastInst->getOpcode();
    if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
      assert(!isUncondBranchOpcode(LastOpc) &&
             "unreachable unconditional branches removed above");

      if (isCondBranchOpcode(LastOpc)) {
        // Block ends with fall-through condbranch.
        parseCondBranch(LastInst, TBB, Cond);
        return false;
      }
      return true; // Can't handle indirect branch.
    } else {
      SecondLastInst = &*I;
      SecondLastOpc = SecondLastInst->getOpcode();
    }
  }

  // If there are three terminators, we don't know what sort of block this is.
  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
    return true;

  // If the block ends with a B and a Bcc, handle it.
  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    parseCondBranch(SecondLastInst, TBB, Cond);
    FBB = LastInst->getOperand(0).getMBB();
    return false;
  }

  // If the block ends with two unconditional branches, handle it. The second
  // one is not executed, so remove it.
  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    TBB = SecondLastInst->getOperand(0).getMBB();
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return false;
  }

  // ...likewise if it ends with an indirect branch followed by an unconditional
  // branch.
  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return true;
  }

  // Otherwise, can't handle this.
  return true;
}

bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
                                              MachineBranchPredicate &MBP,
                                              bool AllowModify) const {
  // For the moment, handle only a block which ends with a cb(n)zx followed by
  // a fallthrough. Why this? Because it is a common form.
  // TODO: Should we handle b.cc?
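  // Illustrative shape of the only pattern handled here: a block ending in
  //   cbnz x0, <TrueDest>
  //   ; fall through to the layout successor
  // is reported as (x0 != 0) ? TrueDest : fall-through block, with the
  // comparison expressed as LHS = x0, RHS = #0, Predicate = NE.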
363 364 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); 365 if (I == MBB.end()) 366 return true; 367 368 // Skip over SpeculationBarrierEndBB terminators 369 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB || 370 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) { 371 --I; 372 } 373 374 if (!isUnpredicatedTerminator(*I)) 375 return true; 376 377 // Get the last instruction in the block. 378 MachineInstr *LastInst = &*I; 379 unsigned LastOpc = LastInst->getOpcode(); 380 if (!isCondBranchOpcode(LastOpc)) 381 return true; 382 383 switch (LastOpc) { 384 default: 385 return true; 386 case AArch64::CBZW: 387 case AArch64::CBZX: 388 case AArch64::CBNZW: 389 case AArch64::CBNZX: 390 break; 391 }; 392 393 MBP.TrueDest = LastInst->getOperand(1).getMBB(); 394 assert(MBP.TrueDest && "expected!"); 395 MBP.FalseDest = MBB.getNextNode(); 396 397 MBP.ConditionDef = nullptr; 398 MBP.SingleUseCondition = false; 399 400 MBP.LHS = LastInst->getOperand(0); 401 MBP.RHS = MachineOperand::CreateImm(0); 402 MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE 403 : MachineBranchPredicate::PRED_EQ; 404 return false; 405 } 406 407 bool AArch64InstrInfo::reverseBranchCondition( 408 SmallVectorImpl<MachineOperand> &Cond) const { 409 if (Cond[0].getImm() != -1) { 410 // Regular Bcc 411 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm(); 412 Cond[0].setImm(AArch64CC::getInvertedCondCode(CC)); 413 } else { 414 // Folded compare-and-branch 415 switch (Cond[1].getImm()) { 416 default: 417 llvm_unreachable("Unknown conditional branch!"); 418 case AArch64::CBZW: 419 Cond[1].setImm(AArch64::CBNZW); 420 break; 421 case AArch64::CBNZW: 422 Cond[1].setImm(AArch64::CBZW); 423 break; 424 case AArch64::CBZX: 425 Cond[1].setImm(AArch64::CBNZX); 426 break; 427 case AArch64::CBNZX: 428 Cond[1].setImm(AArch64::CBZX); 429 break; 430 case AArch64::TBZW: 431 Cond[1].setImm(AArch64::TBNZW); 432 break; 433 case AArch64::TBNZW: 434 Cond[1].setImm(AArch64::TBZW); 435 break; 436 case AArch64::TBZX: 437 Cond[1].setImm(AArch64::TBNZX); 438 break; 439 case AArch64::TBNZX: 440 Cond[1].setImm(AArch64::TBZX); 441 break; 442 } 443 } 444 445 return false; 446 } 447 448 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB, 449 int *BytesRemoved) const { 450 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); 451 if (I == MBB.end()) 452 return 0; 453 454 if (!isUncondBranchOpcode(I->getOpcode()) && 455 !isCondBranchOpcode(I->getOpcode())) 456 return 0; 457 458 // Remove the branch. 459 I->eraseFromParent(); 460 461 I = MBB.end(); 462 463 if (I == MBB.begin()) { 464 if (BytesRemoved) 465 *BytesRemoved = 4; 466 return 1; 467 } 468 --I; 469 if (!isCondBranchOpcode(I->getOpcode())) { 470 if (BytesRemoved) 471 *BytesRemoved = 4; 472 return 1; 473 } 474 475 // Remove the branch. 476 I->eraseFromParent(); 477 if (BytesRemoved) 478 *BytesRemoved = 8; 479 480 return 2; 481 } 482 483 void AArch64InstrInfo::instantiateCondBranch( 484 MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB, 485 ArrayRef<MachineOperand> Cond) const { 486 if (Cond[0].getImm() != -1) { 487 // Regular Bcc 488 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB); 489 } else { 490 // Folded compare-and-branch 491 // Note that we use addOperand instead of addReg to keep the flags. 
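    // For example, given the encodings built by parseCondBranch above,
    // Cond = { -1, TBZW, %reg, 3 } becomes "TBZW %reg, 3, <TBB>" and
    // Cond = { -1, CBNZX, %reg } becomes "CBNZX %reg, <TBB>".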
492 const MachineInstrBuilder MIB = 493 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]); 494 if (Cond.size() > 3) 495 MIB.addImm(Cond[3].getImm()); 496 MIB.addMBB(TBB); 497 } 498 } 499 500 unsigned AArch64InstrInfo::insertBranch( 501 MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, 502 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const { 503 // Shouldn't be a fall through. 504 assert(TBB && "insertBranch must not be told to insert a fallthrough"); 505 506 if (!FBB) { 507 if (Cond.empty()) // Unconditional branch? 508 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB); 509 else 510 instantiateCondBranch(MBB, DL, TBB, Cond); 511 512 if (BytesAdded) 513 *BytesAdded = 4; 514 515 return 1; 516 } 517 518 // Two-way conditional branch. 519 instantiateCondBranch(MBB, DL, TBB, Cond); 520 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB); 521 522 if (BytesAdded) 523 *BytesAdded = 8; 524 525 return 2; 526 } 527 528 // Find the original register that VReg is copied from. 529 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { 530 while (Register::isVirtualRegister(VReg)) { 531 const MachineInstr *DefMI = MRI.getVRegDef(VReg); 532 if (!DefMI->isFullCopy()) 533 return VReg; 534 VReg = DefMI->getOperand(1).getReg(); 535 } 536 return VReg; 537 } 538 539 // Determine if VReg is defined by an instruction that can be folded into a 540 // csel instruction. If so, return the folded opcode, and the replacement 541 // register. 542 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, 543 unsigned *NewVReg = nullptr) { 544 VReg = removeCopies(MRI, VReg); 545 if (!Register::isVirtualRegister(VReg)) 546 return 0; 547 548 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg)); 549 const MachineInstr *DefMI = MRI.getVRegDef(VReg); 550 unsigned Opc = 0; 551 unsigned SrcOpNum = 0; 552 switch (DefMI->getOpcode()) { 553 case AArch64::ADDSXri: 554 case AArch64::ADDSWri: 555 // if NZCV is used, do not fold. 556 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 557 return 0; 558 // fall-through to ADDXri and ADDWri. 559 [[fallthrough]]; 560 case AArch64::ADDXri: 561 case AArch64::ADDWri: 562 // add x, 1 -> csinc. 563 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 || 564 DefMI->getOperand(3).getImm() != 0) 565 return 0; 566 SrcOpNum = 1; 567 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr; 568 break; 569 570 case AArch64::ORNXrr: 571 case AArch64::ORNWrr: { 572 // not x -> csinv, represented as orn dst, xzr, src. 573 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); 574 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) 575 return 0; 576 SrcOpNum = 2; 577 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr; 578 break; 579 } 580 581 case AArch64::SUBSXrr: 582 case AArch64::SUBSWrr: 583 // if NZCV is used, do not fold. 584 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 585 return 0; 586 // fall-through to SUBXrr and SUBWrr. 587 [[fallthrough]]; 588 case AArch64::SUBXrr: 589 case AArch64::SUBWrr: { 590 // neg x -> csneg, represented as sub dst, xzr, src. 591 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); 592 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) 593 return 0; 594 SrcOpNum = 2; 595 Opc = Is64Bit ? 
AArch64::CSNEGXr : AArch64::CSNEGWr; 596 break; 597 } 598 default: 599 return 0; 600 } 601 assert(Opc && SrcOpNum && "Missing parameters"); 602 603 if (NewVReg) 604 *NewVReg = DefMI->getOperand(SrcOpNum).getReg(); 605 return Opc; 606 } 607 608 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 609 ArrayRef<MachineOperand> Cond, 610 Register DstReg, Register TrueReg, 611 Register FalseReg, int &CondCycles, 612 int &TrueCycles, 613 int &FalseCycles) const { 614 // Check register classes. 615 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 616 const TargetRegisterClass *RC = 617 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); 618 if (!RC) 619 return false; 620 621 // Also need to check the dest regclass, in case we're trying to optimize 622 // something like: 623 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2 624 if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg))) 625 return false; 626 627 // Expanding cbz/tbz requires an extra cycle of latency on the condition. 628 unsigned ExtraCondLat = Cond.size() != 1; 629 630 // GPRs are handled by csel. 631 // FIXME: Fold in x+1, -x, and ~x when applicable. 632 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) || 633 AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 634 // Single-cycle csel, csinc, csinv, and csneg. 635 CondCycles = 1 + ExtraCondLat; 636 TrueCycles = FalseCycles = 1; 637 if (canFoldIntoCSel(MRI, TrueReg)) 638 TrueCycles = 0; 639 else if (canFoldIntoCSel(MRI, FalseReg)) 640 FalseCycles = 0; 641 return true; 642 } 643 644 // Scalar floating point is handled by fcsel. 645 // FIXME: Form fabs, fmin, and fmax when applicable. 646 if (AArch64::FPR64RegClass.hasSubClassEq(RC) || 647 AArch64::FPR32RegClass.hasSubClassEq(RC)) { 648 CondCycles = 5 + ExtraCondLat; 649 TrueCycles = FalseCycles = 2; 650 return true; 651 } 652 653 // Can't do vectors. 654 return false; 655 } 656 657 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, 658 MachineBasicBlock::iterator I, 659 const DebugLoc &DL, Register DstReg, 660 ArrayRef<MachineOperand> Cond, 661 Register TrueReg, Register FalseReg) const { 662 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 663 664 // Parse the condition code, see parseCondBranch() above. 665 AArch64CC::CondCode CC; 666 switch (Cond.size()) { 667 default: 668 llvm_unreachable("Unknown condition opcode in Cond"); 669 case 1: // b.cc 670 CC = AArch64CC::CondCode(Cond[0].getImm()); 671 break; 672 case 3: { // cbz/cbnz 673 // We must insert a compare against 0. 674 bool Is64Bit; 675 switch (Cond[1].getImm()) { 676 default: 677 llvm_unreachable("Unknown branch opcode in Cond"); 678 case AArch64::CBZW: 679 Is64Bit = false; 680 CC = AArch64CC::EQ; 681 break; 682 case AArch64::CBZX: 683 Is64Bit = true; 684 CC = AArch64CC::EQ; 685 break; 686 case AArch64::CBNZW: 687 Is64Bit = false; 688 CC = AArch64CC::NE; 689 break; 690 case AArch64::CBNZX: 691 Is64Bit = true; 692 CC = AArch64CC::NE; 693 break; 694 } 695 Register SrcReg = Cond[2].getReg(); 696 if (Is64Bit) { 697 // cmp reg, #0 is actually subs xzr, reg, #0. 698 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass); 699 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR) 700 .addReg(SrcReg) 701 .addImm(0) 702 .addImm(0); 703 } else { 704 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass); 705 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR) 706 .addReg(SrcReg) 707 .addImm(0) 708 .addImm(0); 709 } 710 break; 711 } 712 case 4: { // tbz/tbnz 713 // We must insert a tst instruction. 
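    // Sketch of the lowering below: for Cond = { -1, TBZW, %reg, <bit> } we
    // emit "ANDSWri wzr, %reg, #(1 << bit)" and select on EQ; the TBNZ forms
    // select on NE, and the X-register forms use ANDSXri.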
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::TBZW:
    case AArch64::TBZX:
      CC = AArch64CC::EQ;
      break;
    case AArch64::TBNZW:
    case AArch64::TBNZX:
      CC = AArch64CC::NE;
      break;
    }
    // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
      BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
    else
      BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
    break;
  }
  }

  unsigned Opc = 0;
  const TargetRegisterClass *RC = nullptr;
  bool TryFold = false;
  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
    RC = &AArch64::GPR64RegClass;
    Opc = AArch64::CSELXr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
    RC = &AArch64::GPR32RegClass;
    Opc = AArch64::CSELWr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FCSELDrrr;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
    RC = &AArch64::FPR32RegClass;
    Opc = AArch64::FCSELSrrr;
  }
  assert(RC && "Unsupported regclass");

  // Try folding simple instructions into the csel.
  if (TryFold) {
    unsigned NewVReg = 0;
    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
    if (FoldedOpc) {
      // The folded opcodes csinc, csinv and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
      CC = AArch64CC::getInvertedCondCode(CC);
      TrueReg = FalseReg;
    } else
      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);

    // Fold the operation. Leave any dead instructions for DCE to clean up.
    if (FoldedOpc) {
      FalseReg = NewVReg;
      Opc = FoldedOpc;
      // This extends the live range of NewVReg.
      MRI.clearKillFlags(NewVReg);
    }
  }

  // Pull all virtual registers into the appropriate class.
  MRI.constrainRegClass(TrueReg, RC);
  MRI.constrainRegClass(FalseReg, RC);

  // Insert the csel.
  BuildMI(MBB, I, DL, get(Opc), DstReg)
      .addReg(TrueReg)
      .addReg(FalseReg)
      .addImm(CC);
}

/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
  uint64_t Imm = MI.getOperand(1).getImm();
  uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
  uint64_t Encoding;
  return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
}

// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
  if (!Subtarget.hasCustomCheapAsMoveHandling())
    return MI.isAsCheapAsAMove();

  const unsigned Opcode = MI.getOpcode();

  // Firstly, check cases gated by features.
810 811 if (Subtarget.hasZeroCycleZeroingFP()) { 812 if (Opcode == AArch64::FMOVH0 || 813 Opcode == AArch64::FMOVS0 || 814 Opcode == AArch64::FMOVD0) 815 return true; 816 } 817 818 if (Subtarget.hasZeroCycleZeroingGP()) { 819 if (Opcode == TargetOpcode::COPY && 820 (MI.getOperand(1).getReg() == AArch64::WZR || 821 MI.getOperand(1).getReg() == AArch64::XZR)) 822 return true; 823 } 824 825 // Secondly, check cases specific to sub-targets. 826 827 if (Subtarget.hasExynosCheapAsMoveHandling()) { 828 if (isExynosCheapAsMove(MI)) 829 return true; 830 831 return MI.isAsCheapAsAMove(); 832 } 833 834 // Finally, check generic cases. 835 836 switch (Opcode) { 837 default: 838 return false; 839 840 // add/sub on register without shift 841 case AArch64::ADDWri: 842 case AArch64::ADDXri: 843 case AArch64::SUBWri: 844 case AArch64::SUBXri: 845 return (MI.getOperand(3).getImm() == 0); 846 847 // logical ops on immediate 848 case AArch64::ANDWri: 849 case AArch64::ANDXri: 850 case AArch64::EORWri: 851 case AArch64::EORXri: 852 case AArch64::ORRWri: 853 case AArch64::ORRXri: 854 return true; 855 856 // logical ops on register without shift 857 case AArch64::ANDWrr: 858 case AArch64::ANDXrr: 859 case AArch64::BICWrr: 860 case AArch64::BICXrr: 861 case AArch64::EONWrr: 862 case AArch64::EONXrr: 863 case AArch64::EORWrr: 864 case AArch64::EORXrr: 865 case AArch64::ORNWrr: 866 case AArch64::ORNXrr: 867 case AArch64::ORRWrr: 868 case AArch64::ORRXrr: 869 return true; 870 871 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or 872 // ORRXri, it is as cheap as MOV 873 case AArch64::MOVi32imm: 874 return canBeExpandedToORR(MI, 32); 875 case AArch64::MOVi64imm: 876 return canBeExpandedToORR(MI, 64); 877 } 878 879 llvm_unreachable("Unknown opcode to check as cheap as a move!"); 880 } 881 882 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) { 883 switch (MI.getOpcode()) { 884 default: 885 return false; 886 887 case AArch64::ADDWrs: 888 case AArch64::ADDXrs: 889 case AArch64::ADDSWrs: 890 case AArch64::ADDSXrs: { 891 unsigned Imm = MI.getOperand(3).getImm(); 892 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 893 if (ShiftVal == 0) 894 return true; 895 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5; 896 } 897 898 case AArch64::ADDWrx: 899 case AArch64::ADDXrx: 900 case AArch64::ADDXrx64: 901 case AArch64::ADDSWrx: 902 case AArch64::ADDSXrx: 903 case AArch64::ADDSXrx64: { 904 unsigned Imm = MI.getOperand(3).getImm(); 905 switch (AArch64_AM::getArithExtendType(Imm)) { 906 default: 907 return false; 908 case AArch64_AM::UXTB: 909 case AArch64_AM::UXTH: 910 case AArch64_AM::UXTW: 911 case AArch64_AM::UXTX: 912 return AArch64_AM::getArithShiftValue(Imm) <= 4; 913 } 914 } 915 916 case AArch64::SUBWrs: 917 case AArch64::SUBSWrs: { 918 unsigned Imm = MI.getOperand(3).getImm(); 919 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 920 return ShiftVal == 0 || 921 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31); 922 } 923 924 case AArch64::SUBXrs: 925 case AArch64::SUBSXrs: { 926 unsigned Imm = MI.getOperand(3).getImm(); 927 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 928 return ShiftVal == 0 || 929 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63); 930 } 931 932 case AArch64::SUBWrx: 933 case AArch64::SUBXrx: 934 case AArch64::SUBXrx64: 935 case AArch64::SUBSWrx: 936 case AArch64::SUBSXrx: 937 case AArch64::SUBSXrx64: { 938 unsigned Imm = MI.getOperand(3).getImm(); 939 switch (AArch64_AM::getArithExtendType(Imm)) { 940 
default: 941 return false; 942 case AArch64_AM::UXTB: 943 case AArch64_AM::UXTH: 944 case AArch64_AM::UXTW: 945 case AArch64_AM::UXTX: 946 return AArch64_AM::getArithShiftValue(Imm) == 0; 947 } 948 } 949 950 case AArch64::LDRBBroW: 951 case AArch64::LDRBBroX: 952 case AArch64::LDRBroW: 953 case AArch64::LDRBroX: 954 case AArch64::LDRDroW: 955 case AArch64::LDRDroX: 956 case AArch64::LDRHHroW: 957 case AArch64::LDRHHroX: 958 case AArch64::LDRHroW: 959 case AArch64::LDRHroX: 960 case AArch64::LDRQroW: 961 case AArch64::LDRQroX: 962 case AArch64::LDRSBWroW: 963 case AArch64::LDRSBWroX: 964 case AArch64::LDRSBXroW: 965 case AArch64::LDRSBXroX: 966 case AArch64::LDRSHWroW: 967 case AArch64::LDRSHWroX: 968 case AArch64::LDRSHXroW: 969 case AArch64::LDRSHXroX: 970 case AArch64::LDRSWroW: 971 case AArch64::LDRSWroX: 972 case AArch64::LDRSroW: 973 case AArch64::LDRSroX: 974 case AArch64::LDRWroW: 975 case AArch64::LDRWroX: 976 case AArch64::LDRXroW: 977 case AArch64::LDRXroX: 978 case AArch64::PRFMroW: 979 case AArch64::PRFMroX: 980 case AArch64::STRBBroW: 981 case AArch64::STRBBroX: 982 case AArch64::STRBroW: 983 case AArch64::STRBroX: 984 case AArch64::STRDroW: 985 case AArch64::STRDroX: 986 case AArch64::STRHHroW: 987 case AArch64::STRHHroX: 988 case AArch64::STRHroW: 989 case AArch64::STRHroX: 990 case AArch64::STRQroW: 991 case AArch64::STRQroX: 992 case AArch64::STRSroW: 993 case AArch64::STRSroX: 994 case AArch64::STRWroW: 995 case AArch64::STRWroX: 996 case AArch64::STRXroW: 997 case AArch64::STRXroX: { 998 unsigned IsSigned = MI.getOperand(3).getImm(); 999 return !IsSigned; 1000 } 1001 } 1002 } 1003 1004 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) { 1005 unsigned Opc = MI.getOpcode(); 1006 switch (Opc) { 1007 default: 1008 return false; 1009 case AArch64::SEH_StackAlloc: 1010 case AArch64::SEH_SaveFPLR: 1011 case AArch64::SEH_SaveFPLR_X: 1012 case AArch64::SEH_SaveReg: 1013 case AArch64::SEH_SaveReg_X: 1014 case AArch64::SEH_SaveRegP: 1015 case AArch64::SEH_SaveRegP_X: 1016 case AArch64::SEH_SaveFReg: 1017 case AArch64::SEH_SaveFReg_X: 1018 case AArch64::SEH_SaveFRegP: 1019 case AArch64::SEH_SaveFRegP_X: 1020 case AArch64::SEH_SetFP: 1021 case AArch64::SEH_AddFP: 1022 case AArch64::SEH_Nop: 1023 case AArch64::SEH_PrologEnd: 1024 case AArch64::SEH_EpilogStart: 1025 case AArch64::SEH_EpilogEnd: 1026 case AArch64::SEH_PACSignLR: 1027 return true; 1028 } 1029 } 1030 1031 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, 1032 Register &SrcReg, Register &DstReg, 1033 unsigned &SubIdx) const { 1034 switch (MI.getOpcode()) { 1035 default: 1036 return false; 1037 case AArch64::SBFMXri: // aka sxtw 1038 case AArch64::UBFMXri: // aka uxtw 1039 // Check for the 32 -> 64 bit extension case, these instructions can do 1040 // much more. 1041 if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31) 1042 return false; 1043 // This is a signed or unsigned 32 -> 64 bit extension. 
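    // For example, "sxtw x0, w1" is SBFMXri with immediates 0 and 31; the
    // pre-extension 32-bit value is expected to remain available as the
    // sub_32 subreg of the result, which is what SubIdx reports to the
    // coalescer.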
1044 SrcReg = MI.getOperand(1).getReg(); 1045 DstReg = MI.getOperand(0).getReg(); 1046 SubIdx = AArch64::sub_32; 1047 return true; 1048 } 1049 } 1050 1051 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint( 1052 const MachineInstr &MIa, const MachineInstr &MIb) const { 1053 const TargetRegisterInfo *TRI = &getRegisterInfo(); 1054 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr; 1055 int64_t OffsetA = 0, OffsetB = 0; 1056 unsigned WidthA = 0, WidthB = 0; 1057 bool OffsetAIsScalable = false, OffsetBIsScalable = false; 1058 1059 assert(MIa.mayLoadOrStore() && "MIa must be a load or store."); 1060 assert(MIb.mayLoadOrStore() && "MIb must be a load or store."); 1061 1062 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() || 1063 MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) 1064 return false; 1065 1066 // Retrieve the base, offset from the base and width. Width 1067 // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If 1068 // base are identical, and the offset of a lower memory access + 1069 // the width doesn't overlap the offset of a higher memory access, 1070 // then the memory accesses are different. 1071 // If OffsetAIsScalable and OffsetBIsScalable are both true, they 1072 // are assumed to have the same scale (vscale). 1073 if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable, 1074 WidthA, TRI) && 1075 getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable, 1076 WidthB, TRI)) { 1077 if (BaseOpA->isIdenticalTo(*BaseOpB) && 1078 OffsetAIsScalable == OffsetBIsScalable) { 1079 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; 1080 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; 1081 int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; 1082 if (LowOffset + LowWidth <= HighOffset) 1083 return true; 1084 } 1085 } 1086 return false; 1087 } 1088 1089 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI, 1090 const MachineBasicBlock *MBB, 1091 const MachineFunction &MF) const { 1092 if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF)) 1093 return true; 1094 switch (MI.getOpcode()) { 1095 case AArch64::HINT: 1096 // CSDB hints are scheduling barriers. 1097 if (MI.getOperand(0).getImm() == 0x14) 1098 return true; 1099 break; 1100 case AArch64::DSB: 1101 case AArch64::ISB: 1102 // DSB and ISB also are scheduling barriers. 1103 return true; 1104 case AArch64::MSRpstatesvcrImm1: 1105 // SMSTART and SMSTOP are also scheduling barriers. 1106 return true; 1107 default:; 1108 } 1109 if (isSEHInstruction(MI)) 1110 return true; 1111 auto Next = std::next(MI.getIterator()); 1112 return Next != MBB->end() && Next->isCFIInstruction(); 1113 } 1114 1115 /// analyzeCompare - For a comparison instruction, return the source registers 1116 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. 1117 /// Return true if the comparison instruction can be analyzed. 1118 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, 1119 Register &SrcReg2, int64_t &CmpMask, 1120 int64_t &CmpValue) const { 1121 // The first operand can be a frame index where we'd normally expect a 1122 // register. 
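  // For example, for "subs w0, w1, #7" (SUBSWri) the code below reports
  // SrcReg = w1, SrcReg2 = 0 and CmpValue = 7, while for "subs w0, w1, w2"
  // (SUBSWrr) it reports SrcReg = w1, SrcReg2 = w2 and CmpValue = 0.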
1123 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands"); 1124 if (!MI.getOperand(1).isReg()) 1125 return false; 1126 1127 switch (MI.getOpcode()) { 1128 default: 1129 break; 1130 case AArch64::PTEST_PP: 1131 case AArch64::PTEST_PP_ANY: 1132 SrcReg = MI.getOperand(0).getReg(); 1133 SrcReg2 = MI.getOperand(1).getReg(); 1134 // Not sure about the mask and value for now... 1135 CmpMask = ~0; 1136 CmpValue = 0; 1137 return true; 1138 case AArch64::SUBSWrr: 1139 case AArch64::SUBSWrs: 1140 case AArch64::SUBSWrx: 1141 case AArch64::SUBSXrr: 1142 case AArch64::SUBSXrs: 1143 case AArch64::SUBSXrx: 1144 case AArch64::ADDSWrr: 1145 case AArch64::ADDSWrs: 1146 case AArch64::ADDSWrx: 1147 case AArch64::ADDSXrr: 1148 case AArch64::ADDSXrs: 1149 case AArch64::ADDSXrx: 1150 // Replace SUBSWrr with SUBWrr if NZCV is not used. 1151 SrcReg = MI.getOperand(1).getReg(); 1152 SrcReg2 = MI.getOperand(2).getReg(); 1153 CmpMask = ~0; 1154 CmpValue = 0; 1155 return true; 1156 case AArch64::SUBSWri: 1157 case AArch64::ADDSWri: 1158 case AArch64::SUBSXri: 1159 case AArch64::ADDSXri: 1160 SrcReg = MI.getOperand(1).getReg(); 1161 SrcReg2 = 0; 1162 CmpMask = ~0; 1163 CmpValue = MI.getOperand(2).getImm(); 1164 return true; 1165 case AArch64::ANDSWri: 1166 case AArch64::ANDSXri: 1167 // ANDS does not use the same encoding scheme as the others xxxS 1168 // instructions. 1169 SrcReg = MI.getOperand(1).getReg(); 1170 SrcReg2 = 0; 1171 CmpMask = ~0; 1172 CmpValue = AArch64_AM::decodeLogicalImmediate( 1173 MI.getOperand(2).getImm(), 1174 MI.getOpcode() == AArch64::ANDSWri ? 32 : 64); 1175 return true; 1176 } 1177 1178 return false; 1179 } 1180 1181 static bool UpdateOperandRegClass(MachineInstr &Instr) { 1182 MachineBasicBlock *MBB = Instr.getParent(); 1183 assert(MBB && "Can't get MachineBasicBlock here"); 1184 MachineFunction *MF = MBB->getParent(); 1185 assert(MF && "Can't get MachineFunction here"); 1186 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); 1187 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); 1188 MachineRegisterInfo *MRI = &MF->getRegInfo(); 1189 1190 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx; 1191 ++OpIdx) { 1192 MachineOperand &MO = Instr.getOperand(OpIdx); 1193 const TargetRegisterClass *OpRegCstraints = 1194 Instr.getRegClassConstraint(OpIdx, TII, TRI); 1195 1196 // If there's no constraint, there's nothing to do. 1197 if (!OpRegCstraints) 1198 continue; 1199 // If the operand is a frame index, there's nothing to do here. 1200 // A frame index operand will resolve correctly during PEI. 1201 if (MO.isFI()) 1202 continue; 1203 1204 assert(MO.isReg() && 1205 "Operand has register constraints without being a register!"); 1206 1207 Register Reg = MO.getReg(); 1208 if (Reg.isPhysical()) { 1209 if (!OpRegCstraints->contains(Reg)) 1210 return false; 1211 } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) && 1212 !MRI->constrainRegClass(Reg, OpRegCstraints)) 1213 return false; 1214 } 1215 1216 return true; 1217 } 1218 1219 /// Return the opcode that does not set flags when possible - otherwise 1220 /// return the original opcode. The caller is responsible to do the actual 1221 /// substitution and legality checking. 1222 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) { 1223 // Don't convert all compare instructions, because for some the zero register 1224 // encoding becomes the sp register. 
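  // For instance, "cmp w1, #4" is SUBSWri with wzr as the destination;
  // dropping the S would re-encode register 31 as wsp ("sub wsp, w1, #4"),
  // so such instructions keep their flag-setting opcode below.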
1225 bool MIDefinesZeroReg = false; 1226 if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR)) 1227 MIDefinesZeroReg = true; 1228 1229 switch (MI.getOpcode()) { 1230 default: 1231 return MI.getOpcode(); 1232 case AArch64::ADDSWrr: 1233 return AArch64::ADDWrr; 1234 case AArch64::ADDSWri: 1235 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri; 1236 case AArch64::ADDSWrs: 1237 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs; 1238 case AArch64::ADDSWrx: 1239 return AArch64::ADDWrx; 1240 case AArch64::ADDSXrr: 1241 return AArch64::ADDXrr; 1242 case AArch64::ADDSXri: 1243 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri; 1244 case AArch64::ADDSXrs: 1245 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs; 1246 case AArch64::ADDSXrx: 1247 return AArch64::ADDXrx; 1248 case AArch64::SUBSWrr: 1249 return AArch64::SUBWrr; 1250 case AArch64::SUBSWri: 1251 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri; 1252 case AArch64::SUBSWrs: 1253 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs; 1254 case AArch64::SUBSWrx: 1255 return AArch64::SUBWrx; 1256 case AArch64::SUBSXrr: 1257 return AArch64::SUBXrr; 1258 case AArch64::SUBSXri: 1259 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri; 1260 case AArch64::SUBSXrs: 1261 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs; 1262 case AArch64::SUBSXrx: 1263 return AArch64::SUBXrx; 1264 } 1265 } 1266 1267 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 }; 1268 1269 /// True when condition flags are accessed (either by writing or reading) 1270 /// on the instruction trace starting at From and ending at To. 1271 /// 1272 /// Note: If From and To are from different blocks it's assumed CC are accessed 1273 /// on the path. 1274 static bool areCFlagsAccessedBetweenInstrs( 1275 MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, 1276 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) { 1277 // Early exit if To is at the beginning of the BB. 1278 if (To == To->getParent()->begin()) 1279 return true; 1280 1281 // Check whether the instructions are in the same basic block 1282 // If not, assume the condition flags might get modified somewhere. 1283 if (To->getParent() != From->getParent()) 1284 return true; 1285 1286 // From must be above To. 1287 assert(std::any_of( 1288 ++To.getReverse(), To->getParent()->rend(), 1289 [From](MachineInstr &MI) { return MI.getIterator() == From; })); 1290 1291 // We iterate backward starting at \p To until we hit \p From. 
  for (const MachineInstr &Instr :
       instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
    if (((AccessToCheck & AK_Write) &&
         Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
        ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
      return true;
  }
  return false;
}

/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
/// operation which could set the flags in an identical manner.
bool AArch64InstrInfo::optimizePTestInstr(
    MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
    const MachineRegisterInfo *MRI) const {
  auto *Mask = MRI->getUniqueVRegDef(MaskReg);
  auto *Pred = MRI->getUniqueVRegDef(PredReg);
  auto NewOp = Pred->getOpcode();
  bool OpChanged = false;

  unsigned MaskOpcode = Mask->getOpcode();
  unsigned PredOpcode = Pred->getOpcode();
  bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
  bool PredIsWhileLike = isWhileOpcode(PredOpcode);

  if (isPTrueOpcode(MaskOpcode) && (PredIsPTestLike || PredIsWhileLike) &&
      getElementSizeForOpcode(MaskOpcode) ==
          getElementSizeForOpcode(PredOpcode) &&
      Mask->getOperand(1).getImm() == 31) {
    // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
    // redundant since WHILE performs an implicit PTEST with an all active
    // mask. Must be an all active predicate of matching element size.

    // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
    // PTEST_LIKE instruction uses the same all active mask and the element
    // size matches. If the PTEST has a condition of any then it is always
    // redundant.
    if (PredIsPTestLike) {
      auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
      if (Mask != PTestLikeMask && PTest->getOpcode() != AArch64::PTEST_PP_ANY)
        return false;
    }

    // Fallthrough to simply remove the PTEST.
  } else if ((Mask == Pred) && (PredIsPTestLike || PredIsWhileLike) &&
             PTest->getOpcode() == AArch64::PTEST_PP_ANY) {
    // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
    // instruction that sets the flags as PTEST would. This is only valid when
    // the condition is any.

    // Fallthrough to simply remove the PTEST.
  } else if (PredIsPTestLike) {
    // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
    // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
    // on 8-bit predicates like the PTEST. Otherwise, for instructions like
    // compare that also support 16/32/64-bit predicates, the implicit PTEST
    // performed by the compare could consider fewer lanes for these element
    // sizes.
    //
    // For example, consider
    //
    //   ptrue p0.b                    ; P0=1111-1111-1111-1111
    //   index z0.s, #0, #1            ; Z0=<0,1,2,3>
    //   index z1.s, #1, #1            ; Z1=<1,2,3,4>
    //   cmphi p1.s, p0/z, z1.s, z0.s  ; P1=0001-0001-0001-0001
    //                                 ;       ^ last active
    //   ptest p0, p1.b                ; P1=0001-0001-0001-0001
    //                                 ;       ^ last active
    //
    // where the compare generates a canonical all active 32-bit predicate
    // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
    // active flag, whereas the PTEST instruction with the same mask doesn't.
    // For PTEST_ANY this doesn't apply as the flags in this case would be
    // identical regardless of element size.
    auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
    uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
    if ((Mask != PTestLikeMask) ||
        (PredElementSize != AArch64::ElementSizeB &&
         PTest->getOpcode() != AArch64::PTEST_PP_ANY))
      return false;

    // Fallthrough to simply remove the PTEST.
  } else {
    // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
    // opcode so the PTEST becomes redundant.
    switch (PredOpcode) {
    case AArch64::AND_PPzPP:
    case AArch64::BIC_PPzPP:
    case AArch64::EOR_PPzPP:
    case AArch64::NAND_PPzPP:
    case AArch64::NOR_PPzPP:
    case AArch64::ORN_PPzPP:
    case AArch64::ORR_PPzPP:
    case AArch64::BRKA_PPzP:
    case AArch64::BRKPA_PPzPP:
    case AArch64::BRKB_PPzP:
    case AArch64::BRKPB_PPzPP:
    case AArch64::RDFFR_PPz: {
      // Check to see if our mask is the same. If not the resulting flag bits
      // may be different and we can't remove the ptest.
      auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
      if (Mask != PredMask)
        return false;
      break;
    }
    case AArch64::BRKN_PPzP: {
      // BRKN uses an all active implicit mask to set flags unlike the other
      // flag-setting instructions.
      // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
      if ((MaskOpcode != AArch64::PTRUE_B) ||
          (Mask->getOperand(1).getImm() != 31))
        return false;
      break;
    }
    case AArch64::PTRUE_B:
      // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
      break;
    default:
      // Bail out if we don't recognize the input
      return false;
    }

    NewOp = convertToFlagSettingOpc(PredOpcode);
    OpChanged = true;
  }

  const TargetRegisterInfo *TRI = &getRegisterInfo();

  // If another instruction between Pred and PTest accesses flags, don't remove
  // the ptest or update the earlier instruction to modify them.
  if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
    return false;

  // If we pass all the checks, it's safe to remove the PTEST and use the flags
  // as they are prior to PTEST. Sometimes this requires the tested PTEST
  // operand to be replaced with an equivalent instruction that also sets the
  // flags.
  Pred->setDesc(get(NewOp));
  PTest->eraseFromParent();
  if (OpChanged) {
    bool succeeded = UpdateOperandRegClass(*Pred);
    (void)succeeded;
    assert(succeeded && "Operands have incompatible register classes!");
    Pred->addRegisterDefined(AArch64::NZCV, TRI);
  }

  // Ensure that the flags def is live.
  if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
    unsigned i = 0, e = Pred->getNumOperands();
    for (; i != e; ++i) {
      MachineOperand &MO = Pred->getOperand(i);
      if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
        MO.setIsDead(false);
        break;
      }
    }
  }
  return true;
}

/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It can be treated as a true
/// compare instruction when there are no uses of its destination register.
///
/// The following steps are tried in order:
/// 1. Convert CmpInstr into an unconditional version.
/// 2. Remove CmpInstr if above there is an instruction producing a needed
///    condition code or an instruction which can be converted into such an
///    instruction.
1462 /// Only comparison with zero is supported. 1463 bool AArch64InstrInfo::optimizeCompareInstr( 1464 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, 1465 int64_t CmpValue, const MachineRegisterInfo *MRI) const { 1466 assert(CmpInstr.getParent()); 1467 assert(MRI); 1468 1469 // Replace SUBSWrr with SUBWrr if NZCV is not used. 1470 int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true); 1471 if (DeadNZCVIdx != -1) { 1472 if (CmpInstr.definesRegister(AArch64::WZR) || 1473 CmpInstr.definesRegister(AArch64::XZR)) { 1474 CmpInstr.eraseFromParent(); 1475 return true; 1476 } 1477 unsigned Opc = CmpInstr.getOpcode(); 1478 unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr); 1479 if (NewOpc == Opc) 1480 return false; 1481 const MCInstrDesc &MCID = get(NewOpc); 1482 CmpInstr.setDesc(MCID); 1483 CmpInstr.removeOperand(DeadNZCVIdx); 1484 bool succeeded = UpdateOperandRegClass(CmpInstr); 1485 (void)succeeded; 1486 assert(succeeded && "Some operands reg class are incompatible!"); 1487 return true; 1488 } 1489 1490 if (CmpInstr.getOpcode() == AArch64::PTEST_PP || 1491 CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY) 1492 return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI); 1493 1494 if (SrcReg2 != 0) 1495 return false; 1496 1497 // CmpInstr is a Compare instruction if destination register is not used. 1498 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg())) 1499 return false; 1500 1501 if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI)) 1502 return true; 1503 return (CmpValue == 0 || CmpValue == 1) && 1504 removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI); 1505 } 1506 1507 /// Get opcode of S version of Instr. 1508 /// If Instr is S version its opcode is returned. 1509 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version 1510 /// or we are not interested in it. 1511 static unsigned sForm(MachineInstr &Instr) { 1512 switch (Instr.getOpcode()) { 1513 default: 1514 return AArch64::INSTRUCTION_LIST_END; 1515 1516 case AArch64::ADDSWrr: 1517 case AArch64::ADDSWri: 1518 case AArch64::ADDSXrr: 1519 case AArch64::ADDSXri: 1520 case AArch64::SUBSWrr: 1521 case AArch64::SUBSWri: 1522 case AArch64::SUBSXrr: 1523 case AArch64::SUBSXri: 1524 return Instr.getOpcode(); 1525 1526 case AArch64::ADDWrr: 1527 return AArch64::ADDSWrr; 1528 case AArch64::ADDWri: 1529 return AArch64::ADDSWri; 1530 case AArch64::ADDXrr: 1531 return AArch64::ADDSXrr; 1532 case AArch64::ADDXri: 1533 return AArch64::ADDSXri; 1534 case AArch64::ADCWr: 1535 return AArch64::ADCSWr; 1536 case AArch64::ADCXr: 1537 return AArch64::ADCSXr; 1538 case AArch64::SUBWrr: 1539 return AArch64::SUBSWrr; 1540 case AArch64::SUBWri: 1541 return AArch64::SUBSWri; 1542 case AArch64::SUBXrr: 1543 return AArch64::SUBSXrr; 1544 case AArch64::SUBXri: 1545 return AArch64::SUBSXri; 1546 case AArch64::SBCWr: 1547 return AArch64::SBCSWr; 1548 case AArch64::SBCXr: 1549 return AArch64::SBCSXr; 1550 case AArch64::ANDWri: 1551 return AArch64::ANDSWri; 1552 case AArch64::ANDXri: 1553 return AArch64::ANDSXri; 1554 } 1555 } 1556 1557 /// Check if AArch64::NZCV should be alive in successors of MBB. 1558 static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) { 1559 for (auto *BB : MBB->successors()) 1560 if (BB->isLiveIn(AArch64::NZCV)) 1561 return true; 1562 return false; 1563 } 1564 1565 /// \returns The condition code operand index for \p Instr if it is a branch 1566 /// or select and -1 otherwise. 
1567 static int 1568 findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) { 1569 switch (Instr.getOpcode()) { 1570 default: 1571 return -1; 1572 1573 case AArch64::Bcc: { 1574 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); 1575 assert(Idx >= 2); 1576 return Idx - 2; 1577 } 1578 1579 case AArch64::CSINVWr: 1580 case AArch64::CSINVXr: 1581 case AArch64::CSINCWr: 1582 case AArch64::CSINCXr: 1583 case AArch64::CSELWr: 1584 case AArch64::CSELXr: 1585 case AArch64::CSNEGWr: 1586 case AArch64::CSNEGXr: 1587 case AArch64::FCSELSrrr: 1588 case AArch64::FCSELDrrr: { 1589 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); 1590 assert(Idx >= 1); 1591 return Idx - 1; 1592 } 1593 } 1594 } 1595 1596 /// Find a condition code used by the instruction. 1597 /// Returns AArch64CC::Invalid if either the instruction does not use condition 1598 /// codes or we don't optimize CmpInstr in the presence of such instructions. 1599 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) { 1600 int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr); 1601 return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>( 1602 Instr.getOperand(CCIdx).getImm()) 1603 : AArch64CC::Invalid; 1604 } 1605 1606 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) { 1607 assert(CC != AArch64CC::Invalid); 1608 UsedNZCV UsedFlags; 1609 switch (CC) { 1610 default: 1611 break; 1612 1613 case AArch64CC::EQ: // Z set 1614 case AArch64CC::NE: // Z clear 1615 UsedFlags.Z = true; 1616 break; 1617 1618 case AArch64CC::HI: // Z clear and C set 1619 case AArch64CC::LS: // Z set or C clear 1620 UsedFlags.Z = true; 1621 [[fallthrough]]; 1622 case AArch64CC::HS: // C set 1623 case AArch64CC::LO: // C clear 1624 UsedFlags.C = true; 1625 break; 1626 1627 case AArch64CC::MI: // N set 1628 case AArch64CC::PL: // N clear 1629 UsedFlags.N = true; 1630 break; 1631 1632 case AArch64CC::VS: // V set 1633 case AArch64CC::VC: // V clear 1634 UsedFlags.V = true; 1635 break; 1636 1637 case AArch64CC::GT: // Z clear, N and V the same 1638 case AArch64CC::LE: // Z set, N and V differ 1639 UsedFlags.Z = true; 1640 [[fallthrough]]; 1641 case AArch64CC::GE: // N and V the same 1642 case AArch64CC::LT: // N and V differ 1643 UsedFlags.N = true; 1644 UsedFlags.V = true; 1645 break; 1646 } 1647 return UsedFlags; 1648 } 1649 1650 /// \returns Conditions flags used after \p CmpInstr in its MachineBB if NZCV 1651 /// flags are not alive in successors of the same \p CmpInstr and \p MI parent. 1652 /// \returns std::nullopt otherwise. 1653 /// 1654 /// Collect instructions using that flags in \p CCUseInstrs if provided. 
1655 std::optional<UsedNZCV> 1656 llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, 1657 const TargetRegisterInfo &TRI, 1658 SmallVectorImpl<MachineInstr *> *CCUseInstrs) { 1659 MachineBasicBlock *CmpParent = CmpInstr.getParent(); 1660 if (MI.getParent() != CmpParent) 1661 return std::nullopt; 1662 1663 if (areCFlagsAliveInSuccessors(CmpParent)) 1664 return std::nullopt; 1665 1666 UsedNZCV NZCVUsedAfterCmp; 1667 for (MachineInstr &Instr : instructionsWithoutDebug( 1668 std::next(CmpInstr.getIterator()), CmpParent->instr_end())) { 1669 if (Instr.readsRegister(AArch64::NZCV, &TRI)) { 1670 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr); 1671 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction 1672 return std::nullopt; 1673 NZCVUsedAfterCmp |= getUsedNZCV(CC); 1674 if (CCUseInstrs) 1675 CCUseInstrs->push_back(&Instr); 1676 } 1677 if (Instr.modifiesRegister(AArch64::NZCV, &TRI)) 1678 break; 1679 } 1680 return NZCVUsedAfterCmp; 1681 } 1682 1683 static bool isADDSRegImm(unsigned Opcode) { 1684 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri; 1685 } 1686 1687 static bool isSUBSRegImm(unsigned Opcode) { 1688 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri; 1689 } 1690 1691 /// Check if CmpInstr can be substituted by MI. 1692 /// 1693 /// CmpInstr can be substituted: 1694 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0' 1695 /// - and, MI and CmpInstr are from the same MachineBB 1696 /// - and, condition flags are not alive in successors of the CmpInstr parent 1697 /// - and, if MI opcode is the S form there must be no defs of flags between 1698 /// MI and CmpInstr 1699 /// or if MI opcode is not the S form there must be neither defs of flags 1700 /// nor uses of flags between MI and CmpInstr. 1701 /// - and, if C/V flags are not used after CmpInstr 1702 /// or if N flag is used but MI produces poison value if signed overflow 1703 /// occurs. 1704 static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, 1705 const TargetRegisterInfo &TRI) { 1706 // NOTE this assertion guarantees that MI.getOpcode() is add or subtraction 1707 // that may or may not set flags. 1708 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END); 1709 1710 const unsigned CmpOpcode = CmpInstr.getOpcode(); 1711 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode)) 1712 return false; 1713 1714 assert((CmpInstr.getOperand(2).isImm() && 1715 CmpInstr.getOperand(2).getImm() == 0) && 1716 "Caller guarantees that CmpInstr compares with constant 0"); 1717 1718 std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI); 1719 if (!NZVCUsed || NZVCUsed->C) 1720 return false; 1721 1722 // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either 1723 // '%vreg = add ...' or '%vreg = sub ...'. 1724 // Condition flag V is used to indicate signed overflow. 1725 // 1) MI and CmpInstr set N and V to the same value. 1726 // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when 1727 // signed overflow occurs, so CmpInstr could still be simplified away. 1728 if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap)) 1729 return false; 1730 1731 AccessKind AccessToCheck = AK_Write; 1732 if (sForm(MI) != MI.getOpcode()) 1733 AccessToCheck = AK_All; 1734 return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck); 1735 } 1736 1737 /// Substitute an instruction comparing to zero with another instruction 1738 /// which produces needed condition flags. 
1739 /// 1740 /// Return true on success. 1741 bool AArch64InstrInfo::substituteCmpToZero( 1742 MachineInstr &CmpInstr, unsigned SrcReg, 1743 const MachineRegisterInfo &MRI) const { 1744 // Get the unique definition of SrcReg. 1745 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg); 1746 if (!MI) 1747 return false; 1748 1749 const TargetRegisterInfo &TRI = getRegisterInfo(); 1750 1751 unsigned NewOpc = sForm(*MI); 1752 if (NewOpc == AArch64::INSTRUCTION_LIST_END) 1753 return false; 1754 1755 if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI)) 1756 return false; 1757 1758 // Update the instruction to set NZCV. 1759 MI->setDesc(get(NewOpc)); 1760 CmpInstr.eraseFromParent(); 1761 bool succeeded = UpdateOperandRegClass(*MI); 1762 (void)succeeded; 1763 assert(succeeded && "Some operands reg class are incompatible!"); 1764 MI->addRegisterDefined(AArch64::NZCV, &TRI); 1765 return true; 1766 } 1767 1768 /// \returns True if \p CmpInstr can be removed. 1769 /// 1770 /// \p IsInvertCC is true if, after removing \p CmpInstr, condition 1771 /// codes used in \p CCUseInstrs must be inverted. 1772 static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr, 1773 int CmpValue, const TargetRegisterInfo &TRI, 1774 SmallVectorImpl<MachineInstr *> &CCUseInstrs, 1775 bool &IsInvertCC) { 1776 assert((CmpValue == 0 || CmpValue == 1) && 1777 "Only comparisons to 0 or 1 considered for removal!"); 1778 1779 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>' 1780 unsigned MIOpc = MI.getOpcode(); 1781 if (MIOpc == AArch64::CSINCWr) { 1782 if (MI.getOperand(1).getReg() != AArch64::WZR || 1783 MI.getOperand(2).getReg() != AArch64::WZR) 1784 return false; 1785 } else if (MIOpc == AArch64::CSINCXr) { 1786 if (MI.getOperand(1).getReg() != AArch64::XZR || 1787 MI.getOperand(2).getReg() != AArch64::XZR) 1788 return false; 1789 } else { 1790 return false; 1791 } 1792 AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI); 1793 if (MICC == AArch64CC::Invalid) 1794 return false; 1795 1796 // NZCV needs to be defined 1797 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) 1798 return false; 1799 1800 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1' 1801 const unsigned CmpOpcode = CmpInstr.getOpcode(); 1802 bool IsSubsRegImm = isSUBSRegImm(CmpOpcode); 1803 if (CmpValue && !IsSubsRegImm) 1804 return false; 1805 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode)) 1806 return false; 1807 1808 // MI conditions allowed: eq, ne, mi, pl 1809 UsedNZCV MIUsedNZCV = getUsedNZCV(MICC); 1810 if (MIUsedNZCV.C || MIUsedNZCV.V) 1811 return false; 1812 1813 std::optional<UsedNZCV> NZCVUsedAfterCmp = 1814 examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs); 1815 // Condition flags are not used in CmpInstr basic block successors and only 1816 // Z or N flags allowed to be used after CmpInstr within its basic block 1817 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V) 1818 return false; 1819 // Z or N flag used after CmpInstr must correspond to the flag used in MI 1820 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) || 1821 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z)) 1822 return false; 1823 // If CmpInstr is comparison to zero MI conditions are limited to eq, ne 1824 if (MIUsedNZCV.N && !CmpValue) 1825 return false; 1826 1827 // There must be no defs of flags between MI and CmpInstr 1828 if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write)) 1829 return false; 1830 1831 // Condition code is inverted in the following cases: 1832 // 1. 
MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' 1833 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1' 1834 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) || 1835 (!CmpValue && MICC == AArch64CC::NE); 1836 return true; 1837 } 1838 1839 /// Remove comparison in csinc-cmp sequence 1840 /// 1841 /// Examples: 1842 /// 1. \code 1843 /// csinc w9, wzr, wzr, ne 1844 /// cmp w9, #0 1845 /// b.eq 1846 /// \endcode 1847 /// to 1848 /// \code 1849 /// csinc w9, wzr, wzr, ne 1850 /// b.ne 1851 /// \endcode 1852 /// 1853 /// 2. \code 1854 /// csinc x2, xzr, xzr, mi 1855 /// cmp x2, #1 1856 /// b.pl 1857 /// \endcode 1858 /// to 1859 /// \code 1860 /// csinc x2, xzr, xzr, mi 1861 /// b.pl 1862 /// \endcode 1863 /// 1864 /// \param CmpInstr comparison instruction 1865 /// \return True when comparison removed 1866 bool AArch64InstrInfo::removeCmpToZeroOrOne( 1867 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue, 1868 const MachineRegisterInfo &MRI) const { 1869 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg); 1870 if (!MI) 1871 return false; 1872 const TargetRegisterInfo &TRI = getRegisterInfo(); 1873 SmallVector<MachineInstr *, 4> CCUseInstrs; 1874 bool IsInvertCC = false; 1875 if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs, 1876 IsInvertCC)) 1877 return false; 1878 // Make transformation 1879 CmpInstr.eraseFromParent(); 1880 if (IsInvertCC) { 1881 // Invert condition codes in CmpInstr CC users 1882 for (MachineInstr *CCUseInstr : CCUseInstrs) { 1883 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr); 1884 assert(Idx >= 0 && "Unexpected instruction using CC."); 1885 MachineOperand &CCOperand = CCUseInstr->getOperand(Idx); 1886 AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode( 1887 static_cast<AArch64CC::CondCode>(CCOperand.getImm())); 1888 CCOperand.setImm(CCUse); 1889 } 1890 } 1891 return true; 1892 } 1893 1894 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { 1895 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD && 1896 MI.getOpcode() != AArch64::CATCHRET) 1897 return false; 1898 1899 MachineBasicBlock &MBB = *MI.getParent(); 1900 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>(); 1901 auto TRI = Subtarget.getRegisterInfo(); 1902 DebugLoc DL = MI.getDebugLoc(); 1903 1904 if (MI.getOpcode() == AArch64::CATCHRET) { 1905 // Skip to the first instruction before the epilog. 
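    // Epilogue instructions are tagged FrameDestroy, so walk backwards over
    // them; the ADRP/ADD pair below, which materializes the address of the
    // catchret target block in X0, must be inserted ahead of the epilogue.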
    const TargetInstrInfo *TII =
        MBB.getParent()->getSubtarget().getInstrInfo();
    MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
    auto MBBI = MachineBasicBlock::iterator(MI);
    MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
    while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
           FirstEpilogSEH != MBB.begin())
      FirstEpilogSEH = std::prev(FirstEpilogSEH);
    if (FirstEpilogSEH != MBB.begin())
      FirstEpilogSEH = std::next(FirstEpilogSEH);
    BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
        .addReg(AArch64::X0, RegState::Define)
        .addMBB(TargetMBB);
    BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
        .addReg(AArch64::X0, RegState::Define)
        .addReg(AArch64::X0)
        .addMBB(TargetMBB)
        .addImm(0);
    return true;
  }

  Register Reg = MI.getOperand(0).getReg();
  Module &M = *MBB.getParent()->getFunction().getParent();
  if (M.getStackProtectorGuard() == "sysreg") {
    const AArch64SysReg::SysReg *SrcReg =
        AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
    if (!SrcReg)
      report_fatal_error("Unknown SysReg for Stack Protector Guard Register");

    // mrs xN, sysreg
    BuildMI(MBB, MI, DL, get(AArch64::MRS))
        .addDef(Reg, RegState::Renamable)
        .addImm(SrcReg->Encoding);
    int Offset = M.getStackProtectorGuardOffset();
    if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
      // ldr xN, [xN, #offset]
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(Offset / 8);
    } else if (Offset >= -256 && Offset <= 255) {
      // ldur xN, [xN, #offset]
      BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(Offset);
    } else if (Offset >= -4095 && Offset <= 4095) {
      if (Offset > 0) {
        // add xN, xN, #offset
        BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
            .addDef(Reg)
            .addUse(Reg, RegState::Kill)
            .addImm(Offset)
            .addImm(0);
      } else {
        // sub xN, xN, #offset
        BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
            .addDef(Reg)
            .addUse(Reg, RegState::Kill)
            .addImm(-Offset)
            .addImm(0);
      }
      // ldr xN, [xN]
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(0);
    } else {
      // Anything not handled above: offsets outside +/- 4095 that are
      // negative, not a multiple of 8, or larger than 32760.
      // It might be nice to use AArch64::MOVi32imm here, which would get
      // expanded in PreSched2 after PostRA, but our lone scratch Reg already
      // contains the MRS result. findScratchNonCalleeSaveRegister() in
      // AArch64FrameLowering might help us find such a scratch register
      // though. If we failed to find a scratch register, we could emit a
      // stream of add instructions to build up the immediate. Or, we could try
      // to insert an AArch64::MOVi32imm before register allocation so that we
      // didn't need to scavenge for a scratch register.
1984 report_fatal_error("Unable to encode Stack Protector Guard Offset"); 1985 } 1986 MBB.erase(MI); 1987 return true; 1988 } 1989 1990 const GlobalValue *GV = 1991 cast<GlobalValue>((*MI.memoperands_begin())->getValue()); 1992 const TargetMachine &TM = MBB.getParent()->getTarget(); 1993 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); 1994 const unsigned char MO_NC = AArch64II::MO_NC; 1995 1996 if ((OpFlags & AArch64II::MO_GOT) != 0) { 1997 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg) 1998 .addGlobalAddress(GV, 0, OpFlags); 1999 if (Subtarget.isTargetILP32()) { 2000 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 2001 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 2002 .addDef(Reg32, RegState::Dead) 2003 .addUse(Reg, RegState::Kill) 2004 .addImm(0) 2005 .addMemOperand(*MI.memoperands_begin()) 2006 .addDef(Reg, RegState::Implicit); 2007 } else { 2008 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 2009 .addReg(Reg, RegState::Kill) 2010 .addImm(0) 2011 .addMemOperand(*MI.memoperands_begin()); 2012 } 2013 } else if (TM.getCodeModel() == CodeModel::Large) { 2014 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?"); 2015 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) 2016 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC) 2017 .addImm(0); 2018 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2019 .addReg(Reg, RegState::Kill) 2020 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC) 2021 .addImm(16); 2022 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2023 .addReg(Reg, RegState::Kill) 2024 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC) 2025 .addImm(32); 2026 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2027 .addReg(Reg, RegState::Kill) 2028 .addGlobalAddress(GV, 0, AArch64II::MO_G3) 2029 .addImm(48); 2030 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 2031 .addReg(Reg, RegState::Kill) 2032 .addImm(0) 2033 .addMemOperand(*MI.memoperands_begin()); 2034 } else if (TM.getCodeModel() == CodeModel::Tiny) { 2035 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg) 2036 .addGlobalAddress(GV, 0, OpFlags); 2037 } else { 2038 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg) 2039 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); 2040 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC; 2041 if (Subtarget.isTargetILP32()) { 2042 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 2043 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 2044 .addDef(Reg32, RegState::Dead) 2045 .addUse(Reg, RegState::Kill) 2046 .addGlobalAddress(GV, 0, LoFlags) 2047 .addMemOperand(*MI.memoperands_begin()) 2048 .addDef(Reg, RegState::Implicit); 2049 } else { 2050 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 2051 .addReg(Reg, RegState::Kill) 2052 .addGlobalAddress(GV, 0, LoFlags) 2053 .addMemOperand(*MI.memoperands_begin()); 2054 } 2055 } 2056 2057 MBB.erase(MI); 2058 2059 return true; 2060 } 2061 2062 // Return true if this instruction simply sets its single destination register 2063 // to zero. This is equivalent to a register rename of the zero-register. 
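// For example (illustrative):
//   movz w0, #0          // MOVZWi with a zero immediate
//   and  x1, xzr, #0xff  // ANDXri reading XZR
// both qualify, whereas a movz with a non-zero immediate does not.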
bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
    if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
      assert(MI.getDesc().getNumOperands() == 3 &&
             MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
      return true;
    }
    break;
  case AArch64::ANDWri: // and Rd, Rzr, #imm
    return MI.getOperand(1).getReg() == AArch64::WZR;
  case AArch64::ANDXri:
    return MI.getOperand(1).getReg() == AArch64::XZR;
  case TargetOpcode::COPY:
    return MI.getOperand(1).getReg() == AArch64::WZR;
  }
  return false;
}

// Return true if this instruction simply renames a general register without
// modifying bits.
bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    break;
  case TargetOpcode::COPY: {
    // GPR32 copies will be lowered to ORRXrs
    Register DstReg = MI.getOperand(0).getReg();
    return (AArch64::GPR32RegClass.contains(DstReg) ||
            AArch64::GPR64RegClass.contains(DstReg));
  }
  case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
    if (MI.getOperand(1).getReg() == AArch64::XZR) {
      assert(MI.getDesc().getNumOperands() == 4 &&
             MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
      return true;
    }
    break;
  case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
    if (MI.getOperand(2).getImm() == 0) {
      assert(MI.getDesc().getNumOperands() == 4 &&
             MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
      return true;
    }
    break;
  }
  return false;
}

// Return true if this instruction simply renames a floating-point register
// without modifying bits.
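// For example (illustrative), "orr v0.16b, v1.16b, v1.16b" (ORRv16i8 with
// identical source registers) and a COPY whose destination is in FPR128 are
// both such renames.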
2118 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) { 2119 switch (MI.getOpcode()) { 2120 default: 2121 break; 2122 case TargetOpcode::COPY: { 2123 Register DstReg = MI.getOperand(0).getReg(); 2124 return AArch64::FPR128RegClass.contains(DstReg); 2125 } 2126 case AArch64::ORRv16i8: 2127 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) { 2128 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() && 2129 "invalid ORRv16i8 operands"); 2130 return true; 2131 } 2132 break; 2133 } 2134 return false; 2135 } 2136 2137 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, 2138 int &FrameIndex) const { 2139 switch (MI.getOpcode()) { 2140 default: 2141 break; 2142 case AArch64::LDRWui: 2143 case AArch64::LDRXui: 2144 case AArch64::LDRBui: 2145 case AArch64::LDRHui: 2146 case AArch64::LDRSui: 2147 case AArch64::LDRDui: 2148 case AArch64::LDRQui: 2149 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && 2150 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { 2151 FrameIndex = MI.getOperand(1).getIndex(); 2152 return MI.getOperand(0).getReg(); 2153 } 2154 break; 2155 } 2156 2157 return 0; 2158 } 2159 2160 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI, 2161 int &FrameIndex) const { 2162 switch (MI.getOpcode()) { 2163 default: 2164 break; 2165 case AArch64::STRWui: 2166 case AArch64::STRXui: 2167 case AArch64::STRBui: 2168 case AArch64::STRHui: 2169 case AArch64::STRSui: 2170 case AArch64::STRDui: 2171 case AArch64::STRQui: 2172 case AArch64::LDR_PXI: 2173 case AArch64::STR_PXI: 2174 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && 2175 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { 2176 FrameIndex = MI.getOperand(1).getIndex(); 2177 return MI.getOperand(0).getReg(); 2178 } 2179 break; 2180 } 2181 return 0; 2182 } 2183 2184 /// Check all MachineMemOperands for a hint to suppress pairing. 2185 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) { 2186 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 2187 return MMO->getFlags() & MOSuppressPair; 2188 }); 2189 } 2190 2191 /// Set a flag on the first MachineMemOperand to suppress pairing. 2192 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) { 2193 if (MI.memoperands_empty()) 2194 return; 2195 (*MI.memoperands_begin())->setFlags(MOSuppressPair); 2196 } 2197 2198 /// Check all MachineMemOperands for a hint that the load/store is strided. 
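/// MOStridedAccess is a target-specific MachineMemOperand flag that earlier
/// AArch64 passes may attach when they recognize an access as part of a
/// strided (regularly spaced) pattern; this helper only tests for it.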
2199 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) { 2200 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 2201 return MMO->getFlags() & MOStridedAccess; 2202 }); 2203 } 2204 2205 bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) { 2206 switch (Opc) { 2207 default: 2208 return false; 2209 case AArch64::STURSi: 2210 case AArch64::STRSpre: 2211 case AArch64::STURDi: 2212 case AArch64::STRDpre: 2213 case AArch64::STURQi: 2214 case AArch64::STRQpre: 2215 case AArch64::STURBBi: 2216 case AArch64::STURHHi: 2217 case AArch64::STURWi: 2218 case AArch64::STRWpre: 2219 case AArch64::STURXi: 2220 case AArch64::STRXpre: 2221 case AArch64::LDURSi: 2222 case AArch64::LDRSpre: 2223 case AArch64::LDURDi: 2224 case AArch64::LDRDpre: 2225 case AArch64::LDURQi: 2226 case AArch64::LDRQpre: 2227 case AArch64::LDURWi: 2228 case AArch64::LDRWpre: 2229 case AArch64::LDURXi: 2230 case AArch64::LDRXpre: 2231 case AArch64::LDURSWi: 2232 case AArch64::LDURHHi: 2233 case AArch64::LDURBBi: 2234 case AArch64::LDURSBWi: 2235 case AArch64::LDURSHWi: 2236 return true; 2237 } 2238 } 2239 2240 std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) { 2241 switch (Opc) { 2242 default: return {}; 2243 case AArch64::PRFMui: return AArch64::PRFUMi; 2244 case AArch64::LDRXui: return AArch64::LDURXi; 2245 case AArch64::LDRWui: return AArch64::LDURWi; 2246 case AArch64::LDRBui: return AArch64::LDURBi; 2247 case AArch64::LDRHui: return AArch64::LDURHi; 2248 case AArch64::LDRSui: return AArch64::LDURSi; 2249 case AArch64::LDRDui: return AArch64::LDURDi; 2250 case AArch64::LDRQui: return AArch64::LDURQi; 2251 case AArch64::LDRBBui: return AArch64::LDURBBi; 2252 case AArch64::LDRHHui: return AArch64::LDURHHi; 2253 case AArch64::LDRSBXui: return AArch64::LDURSBXi; 2254 case AArch64::LDRSBWui: return AArch64::LDURSBWi; 2255 case AArch64::LDRSHXui: return AArch64::LDURSHXi; 2256 case AArch64::LDRSHWui: return AArch64::LDURSHWi; 2257 case AArch64::LDRSWui: return AArch64::LDURSWi; 2258 case AArch64::STRXui: return AArch64::STURXi; 2259 case AArch64::STRWui: return AArch64::STURWi; 2260 case AArch64::STRBui: return AArch64::STURBi; 2261 case AArch64::STRHui: return AArch64::STURHi; 2262 case AArch64::STRSui: return AArch64::STURSi; 2263 case AArch64::STRDui: return AArch64::STURDi; 2264 case AArch64::STRQui: return AArch64::STURQi; 2265 case AArch64::STRBBui: return AArch64::STURBBi; 2266 case AArch64::STRHHui: return AArch64::STURHHi; 2267 } 2268 } 2269 2270 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { 2271 switch (Opc) { 2272 default: 2273 return 2; 2274 case AArch64::LDPXi: 2275 case AArch64::LDPDi: 2276 case AArch64::STPXi: 2277 case AArch64::STPDi: 2278 case AArch64::LDNPXi: 2279 case AArch64::LDNPDi: 2280 case AArch64::STNPXi: 2281 case AArch64::STNPDi: 2282 case AArch64::LDPQi: 2283 case AArch64::STPQi: 2284 case AArch64::LDNPQi: 2285 case AArch64::STNPQi: 2286 case AArch64::LDPWi: 2287 case AArch64::LDPSi: 2288 case AArch64::STPWi: 2289 case AArch64::STPSi: 2290 case AArch64::LDNPWi: 2291 case AArch64::LDNPSi: 2292 case AArch64::STNPWi: 2293 case AArch64::STNPSi: 2294 case AArch64::LDG: 2295 case AArch64::STGPi: 2296 2297 case AArch64::LD1B_IMM: 2298 case AArch64::LD1B_H_IMM: 2299 case AArch64::LD1B_S_IMM: 2300 case AArch64::LD1B_D_IMM: 2301 case AArch64::LD1SB_H_IMM: 2302 case AArch64::LD1SB_S_IMM: 2303 case AArch64::LD1SB_D_IMM: 2304 case AArch64::LD1H_IMM: 2305 case AArch64::LD1H_S_IMM: 2306 case AArch64::LD1H_D_IMM: 2307 case AArch64::LD1SH_S_IMM: 2308 
case AArch64::LD1SH_D_IMM: 2309 case AArch64::LD1W_IMM: 2310 case AArch64::LD1W_D_IMM: 2311 case AArch64::LD1SW_D_IMM: 2312 case AArch64::LD1D_IMM: 2313 2314 case AArch64::LD2B_IMM: 2315 case AArch64::LD2H_IMM: 2316 case AArch64::LD2W_IMM: 2317 case AArch64::LD2D_IMM: 2318 case AArch64::LD3B_IMM: 2319 case AArch64::LD3H_IMM: 2320 case AArch64::LD3W_IMM: 2321 case AArch64::LD3D_IMM: 2322 case AArch64::LD4B_IMM: 2323 case AArch64::LD4H_IMM: 2324 case AArch64::LD4W_IMM: 2325 case AArch64::LD4D_IMM: 2326 2327 case AArch64::ST1B_IMM: 2328 case AArch64::ST1B_H_IMM: 2329 case AArch64::ST1B_S_IMM: 2330 case AArch64::ST1B_D_IMM: 2331 case AArch64::ST1H_IMM: 2332 case AArch64::ST1H_S_IMM: 2333 case AArch64::ST1H_D_IMM: 2334 case AArch64::ST1W_IMM: 2335 case AArch64::ST1W_D_IMM: 2336 case AArch64::ST1D_IMM: 2337 2338 case AArch64::ST2B_IMM: 2339 case AArch64::ST2H_IMM: 2340 case AArch64::ST2W_IMM: 2341 case AArch64::ST2D_IMM: 2342 case AArch64::ST3B_IMM: 2343 case AArch64::ST3H_IMM: 2344 case AArch64::ST3W_IMM: 2345 case AArch64::ST3D_IMM: 2346 case AArch64::ST4B_IMM: 2347 case AArch64::ST4H_IMM: 2348 case AArch64::ST4W_IMM: 2349 case AArch64::ST4D_IMM: 2350 2351 case AArch64::LD1RB_IMM: 2352 case AArch64::LD1RB_H_IMM: 2353 case AArch64::LD1RB_S_IMM: 2354 case AArch64::LD1RB_D_IMM: 2355 case AArch64::LD1RSB_H_IMM: 2356 case AArch64::LD1RSB_S_IMM: 2357 case AArch64::LD1RSB_D_IMM: 2358 case AArch64::LD1RH_IMM: 2359 case AArch64::LD1RH_S_IMM: 2360 case AArch64::LD1RH_D_IMM: 2361 case AArch64::LD1RSH_S_IMM: 2362 case AArch64::LD1RSH_D_IMM: 2363 case AArch64::LD1RW_IMM: 2364 case AArch64::LD1RW_D_IMM: 2365 case AArch64::LD1RSW_IMM: 2366 case AArch64::LD1RD_IMM: 2367 2368 case AArch64::LDNT1B_ZRI: 2369 case AArch64::LDNT1H_ZRI: 2370 case AArch64::LDNT1W_ZRI: 2371 case AArch64::LDNT1D_ZRI: 2372 case AArch64::STNT1B_ZRI: 2373 case AArch64::STNT1H_ZRI: 2374 case AArch64::STNT1W_ZRI: 2375 case AArch64::STNT1D_ZRI: 2376 2377 case AArch64::LDNF1B_IMM: 2378 case AArch64::LDNF1B_H_IMM: 2379 case AArch64::LDNF1B_S_IMM: 2380 case AArch64::LDNF1B_D_IMM: 2381 case AArch64::LDNF1SB_H_IMM: 2382 case AArch64::LDNF1SB_S_IMM: 2383 case AArch64::LDNF1SB_D_IMM: 2384 case AArch64::LDNF1H_IMM: 2385 case AArch64::LDNF1H_S_IMM: 2386 case AArch64::LDNF1H_D_IMM: 2387 case AArch64::LDNF1SH_S_IMM: 2388 case AArch64::LDNF1SH_D_IMM: 2389 case AArch64::LDNF1W_IMM: 2390 case AArch64::LDNF1W_D_IMM: 2391 case AArch64::LDNF1SW_D_IMM: 2392 case AArch64::LDNF1D_IMM: 2393 return 3; 2394 case AArch64::ADDG: 2395 case AArch64::STGi: 2396 case AArch64::LDR_PXI: 2397 case AArch64::STR_PXI: 2398 return 2; 2399 } 2400 } 2401 2402 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) { 2403 switch (MI.getOpcode()) { 2404 default: 2405 return false; 2406 // Scaled instructions. 2407 case AArch64::STRSui: 2408 case AArch64::STRDui: 2409 case AArch64::STRQui: 2410 case AArch64::STRXui: 2411 case AArch64::STRWui: 2412 case AArch64::LDRSui: 2413 case AArch64::LDRDui: 2414 case AArch64::LDRQui: 2415 case AArch64::LDRXui: 2416 case AArch64::LDRWui: 2417 case AArch64::LDRSWui: 2418 // Unscaled instructions. 
2419 case AArch64::STURSi: 2420 case AArch64::STRSpre: 2421 case AArch64::STURDi: 2422 case AArch64::STRDpre: 2423 case AArch64::STURQi: 2424 case AArch64::STRQpre: 2425 case AArch64::STURWi: 2426 case AArch64::STRWpre: 2427 case AArch64::STURXi: 2428 case AArch64::STRXpre: 2429 case AArch64::LDURSi: 2430 case AArch64::LDRSpre: 2431 case AArch64::LDURDi: 2432 case AArch64::LDRDpre: 2433 case AArch64::LDURQi: 2434 case AArch64::LDRQpre: 2435 case AArch64::LDURWi: 2436 case AArch64::LDRWpre: 2437 case AArch64::LDURXi: 2438 case AArch64::LDRXpre: 2439 case AArch64::LDURSWi: 2440 return true; 2441 } 2442 } 2443 2444 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) { 2445 switch (Opc) { 2446 default: 2447 llvm_unreachable("Opcode has no flag setting equivalent!"); 2448 // 32-bit cases: 2449 case AArch64::ADDWri: 2450 return AArch64::ADDSWri; 2451 case AArch64::ADDWrr: 2452 return AArch64::ADDSWrr; 2453 case AArch64::ADDWrs: 2454 return AArch64::ADDSWrs; 2455 case AArch64::ADDWrx: 2456 return AArch64::ADDSWrx; 2457 case AArch64::ANDWri: 2458 return AArch64::ANDSWri; 2459 case AArch64::ANDWrr: 2460 return AArch64::ANDSWrr; 2461 case AArch64::ANDWrs: 2462 return AArch64::ANDSWrs; 2463 case AArch64::BICWrr: 2464 return AArch64::BICSWrr; 2465 case AArch64::BICWrs: 2466 return AArch64::BICSWrs; 2467 case AArch64::SUBWri: 2468 return AArch64::SUBSWri; 2469 case AArch64::SUBWrr: 2470 return AArch64::SUBSWrr; 2471 case AArch64::SUBWrs: 2472 return AArch64::SUBSWrs; 2473 case AArch64::SUBWrx: 2474 return AArch64::SUBSWrx; 2475 // 64-bit cases: 2476 case AArch64::ADDXri: 2477 return AArch64::ADDSXri; 2478 case AArch64::ADDXrr: 2479 return AArch64::ADDSXrr; 2480 case AArch64::ADDXrs: 2481 return AArch64::ADDSXrs; 2482 case AArch64::ADDXrx: 2483 return AArch64::ADDSXrx; 2484 case AArch64::ANDXri: 2485 return AArch64::ANDSXri; 2486 case AArch64::ANDXrr: 2487 return AArch64::ANDSXrr; 2488 case AArch64::ANDXrs: 2489 return AArch64::ANDSXrs; 2490 case AArch64::BICXrr: 2491 return AArch64::BICSXrr; 2492 case AArch64::BICXrs: 2493 return AArch64::BICSXrs; 2494 case AArch64::SUBXri: 2495 return AArch64::SUBSXri; 2496 case AArch64::SUBXrr: 2497 return AArch64::SUBSXrr; 2498 case AArch64::SUBXrs: 2499 return AArch64::SUBSXrs; 2500 case AArch64::SUBXrx: 2501 return AArch64::SUBSXrx; 2502 // SVE instructions: 2503 case AArch64::AND_PPzPP: 2504 return AArch64::ANDS_PPzPP; 2505 case AArch64::BIC_PPzPP: 2506 return AArch64::BICS_PPzPP; 2507 case AArch64::EOR_PPzPP: 2508 return AArch64::EORS_PPzPP; 2509 case AArch64::NAND_PPzPP: 2510 return AArch64::NANDS_PPzPP; 2511 case AArch64::NOR_PPzPP: 2512 return AArch64::NORS_PPzPP; 2513 case AArch64::ORN_PPzPP: 2514 return AArch64::ORNS_PPzPP; 2515 case AArch64::ORR_PPzPP: 2516 return AArch64::ORRS_PPzPP; 2517 case AArch64::BRKA_PPzP: 2518 return AArch64::BRKAS_PPzP; 2519 case AArch64::BRKPA_PPzPP: 2520 return AArch64::BRKPAS_PPzPP; 2521 case AArch64::BRKB_PPzP: 2522 return AArch64::BRKBS_PPzP; 2523 case AArch64::BRKPB_PPzPP: 2524 return AArch64::BRKPBS_PPzPP; 2525 case AArch64::BRKN_PPzP: 2526 return AArch64::BRKNS_PPzP; 2527 case AArch64::RDFFR_PPz: 2528 return AArch64::RDFFRS_PPz; 2529 case AArch64::PTRUE_B: 2530 return AArch64::PTRUES_B; 2531 } 2532 } 2533 2534 // Is this a candidate for ld/st merging or pairing? For example, we don't 2535 // touch volatiles or load/stores that have a hint to avoid pair formation. 
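// In particular, the checks below reject ordered/volatile accesses, addressing
// forms other than reg/FI + immediate, instructions that modify their own base
// register (unless pre-indexed), accesses carrying the MOSuppressPair hint,
// frame setup/destroy code when Windows CFI is emitted, and 128-bit pairs on
// subtargets where paired Q accesses are slow.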
2536 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const { 2537 2538 bool IsPreLdSt = isPreLdSt(MI); 2539 2540 // If this is a volatile load/store, don't mess with it. 2541 if (MI.hasOrderedMemoryRef()) 2542 return false; 2543 2544 // Make sure this is a reg/fi+imm (as opposed to an address reloc). 2545 // For Pre-inc LD/ST, the operand is shifted by one. 2546 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() || 2547 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) && 2548 "Expected a reg or frame index operand."); 2549 2550 // For Pre-indexed addressing quadword instructions, the third operand is the 2551 // immediate value. 2552 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm(); 2553 2554 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt) 2555 return false; 2556 2557 // Can't merge/pair if the instruction modifies the base register. 2558 // e.g., ldr x0, [x0] 2559 // This case will never occur with an FI base. 2560 // However, if the instruction is an LDR/STR<S,D,Q,W,X>pre, it can be merged. 2561 // For example: 2562 // ldr q0, [x11, #32]! 2563 // ldr q1, [x11, #16] 2564 // to 2565 // ldp q0, q1, [x11, #32]! 2566 if (MI.getOperand(1).isReg() && !IsPreLdSt) { 2567 Register BaseReg = MI.getOperand(1).getReg(); 2568 const TargetRegisterInfo *TRI = &getRegisterInfo(); 2569 if (MI.modifiesRegister(BaseReg, TRI)) 2570 return false; 2571 } 2572 2573 // Check if this load/store has a hint to avoid pair formation. 2574 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. 2575 if (isLdStPairSuppressed(MI)) 2576 return false; 2577 2578 // Do not pair any callee-save store/reload instructions in the 2579 // prologue/epilogue if the CFI information encoded the operations as separate 2580 // instructions, as that will cause the size of the actual prologue to mismatch 2581 // with the prologue size recorded in the Windows CFI. 2582 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo(); 2583 bool NeedsWinCFI = MAI->usesWindowsCFI() && 2584 MI.getMF()->getFunction().needsUnwindTableEntry(); 2585 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) || 2586 MI.getFlag(MachineInstr::FrameDestroy))) 2587 return false; 2588 2589 // On some CPUs quad load/store pairs are slower than two single load/stores. 2590 if (Subtarget.isPaired128Slow()) { 2591 switch (MI.getOpcode()) { 2592 default: 2593 break; 2594 case AArch64::LDURQi: 2595 case AArch64::STURQi: 2596 case AArch64::LDRQui: 2597 case AArch64::STRQui: 2598 return false; 2599 } 2600 } 2601 2602 return true; 2603 } 2604 2605 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth( 2606 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, 2607 int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, 2608 const TargetRegisterInfo *TRI) const { 2609 if (!LdSt.mayLoadOrStore()) 2610 return false; 2611 2612 const MachineOperand *BaseOp; 2613 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable, 2614 Width, TRI)) 2615 return false; 2616 BaseOps.push_back(BaseOp); 2617 return true; 2618 } 2619 2620 std::optional<ExtAddrMode> 2621 AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI, 2622 const TargetRegisterInfo *TRI) const { 2623 const MachineOperand *Base; // Filled with the base operand of MI. 2624 int64_t Offset; // Filled with the offset of MI. 
2625 bool OffsetIsScalable; 2626 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI)) 2627 return std::nullopt; 2628 2629 if (!Base->isReg()) 2630 return std::nullopt; 2631 ExtAddrMode AM; 2632 AM.BaseReg = Base->getReg(); 2633 AM.Displacement = Offset; 2634 AM.ScaledReg = 0; 2635 AM.Scale = 0; 2636 return AM; 2637 } 2638 2639 bool AArch64InstrInfo::getMemOperandWithOffsetWidth( 2640 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset, 2641 bool &OffsetIsScalable, unsigned &Width, 2642 const TargetRegisterInfo *TRI) const { 2643 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 2644 // Handle only loads/stores with base register followed by immediate offset. 2645 if (LdSt.getNumExplicitOperands() == 3) { 2646 // Non-paired instruction (e.g., ldr x1, [x0, #8]). 2647 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) || 2648 !LdSt.getOperand(2).isImm()) 2649 return false; 2650 } else if (LdSt.getNumExplicitOperands() == 4) { 2651 // Paired instruction (e.g., ldp x1, x2, [x0, #8]). 2652 if (!LdSt.getOperand(1).isReg() || 2653 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) || 2654 !LdSt.getOperand(3).isImm()) 2655 return false; 2656 } else 2657 return false; 2658 2659 // Get the scaling factor for the instruction and set the width for the 2660 // instruction. 2661 TypeSize Scale(0U, false); 2662 int64_t Dummy1, Dummy2; 2663 2664 // If this returns false, then it's an instruction we don't want to handle. 2665 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2)) 2666 return false; 2667 2668 // Compute the offset. Offset is calculated as the immediate operand 2669 // multiplied by the scaling factor. Unscaled instructions have scaling factor 2670 // set to 1. 2671 if (LdSt.getNumExplicitOperands() == 3) { 2672 BaseOp = &LdSt.getOperand(1); 2673 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue(); 2674 } else { 2675 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); 2676 BaseOp = &LdSt.getOperand(2); 2677 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue(); 2678 } 2679 OffsetIsScalable = Scale.isScalable(); 2680 2681 if (!BaseOp->isReg() && !BaseOp->isFI()) 2682 return false; 2683 2684 return true; 2685 } 2686 2687 MachineOperand & 2688 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const { 2689 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 2690 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1); 2691 assert(OfsOp.isImm() && "Offset operand wasn't immediate."); 2692 return OfsOp; 2693 } 2694 2695 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, 2696 unsigned &Width, int64_t &MinOffset, 2697 int64_t &MaxOffset) { 2698 const unsigned SVEMaxBytesPerVector = AArch64::SVEMaxBitsPerVector / 8; 2699 switch (Opcode) { 2700 // Not a memory operation or something we want to handle. 
2701 default: 2702 Scale = TypeSize::Fixed(0); 2703 Width = 0; 2704 MinOffset = MaxOffset = 0; 2705 return false; 2706 case AArch64::STRWpost: 2707 case AArch64::LDRWpost: 2708 Width = 32; 2709 Scale = TypeSize::Fixed(4); 2710 MinOffset = -256; 2711 MaxOffset = 255; 2712 break; 2713 case AArch64::LDURQi: 2714 case AArch64::STURQi: 2715 Width = 16; 2716 Scale = TypeSize::Fixed(1); 2717 MinOffset = -256; 2718 MaxOffset = 255; 2719 break; 2720 case AArch64::PRFUMi: 2721 case AArch64::LDURXi: 2722 case AArch64::LDURDi: 2723 case AArch64::STURXi: 2724 case AArch64::STURDi: 2725 Width = 8; 2726 Scale = TypeSize::Fixed(1); 2727 MinOffset = -256; 2728 MaxOffset = 255; 2729 break; 2730 case AArch64::LDURWi: 2731 case AArch64::LDURSi: 2732 case AArch64::LDURSWi: 2733 case AArch64::STURWi: 2734 case AArch64::STURSi: 2735 Width = 4; 2736 Scale = TypeSize::Fixed(1); 2737 MinOffset = -256; 2738 MaxOffset = 255; 2739 break; 2740 case AArch64::LDURHi: 2741 case AArch64::LDURHHi: 2742 case AArch64::LDURSHXi: 2743 case AArch64::LDURSHWi: 2744 case AArch64::STURHi: 2745 case AArch64::STURHHi: 2746 Width = 2; 2747 Scale = TypeSize::Fixed(1); 2748 MinOffset = -256; 2749 MaxOffset = 255; 2750 break; 2751 case AArch64::LDURBi: 2752 case AArch64::LDURBBi: 2753 case AArch64::LDURSBXi: 2754 case AArch64::LDURSBWi: 2755 case AArch64::STURBi: 2756 case AArch64::STURBBi: 2757 Width = 1; 2758 Scale = TypeSize::Fixed(1); 2759 MinOffset = -256; 2760 MaxOffset = 255; 2761 break; 2762 case AArch64::LDPQi: 2763 case AArch64::LDNPQi: 2764 case AArch64::STPQi: 2765 case AArch64::STNPQi: 2766 Scale = TypeSize::Fixed(16); 2767 Width = 32; 2768 MinOffset = -64; 2769 MaxOffset = 63; 2770 break; 2771 case AArch64::LDRQui: 2772 case AArch64::STRQui: 2773 Scale = TypeSize::Fixed(16); 2774 Width = 16; 2775 MinOffset = 0; 2776 MaxOffset = 4095; 2777 break; 2778 case AArch64::LDPXi: 2779 case AArch64::LDPDi: 2780 case AArch64::LDNPXi: 2781 case AArch64::LDNPDi: 2782 case AArch64::STPXi: 2783 case AArch64::STPDi: 2784 case AArch64::STNPXi: 2785 case AArch64::STNPDi: 2786 Scale = TypeSize::Fixed(8); 2787 Width = 16; 2788 MinOffset = -64; 2789 MaxOffset = 63; 2790 break; 2791 case AArch64::PRFMui: 2792 case AArch64::LDRXui: 2793 case AArch64::LDRDui: 2794 case AArch64::STRXui: 2795 case AArch64::STRDui: 2796 Scale = TypeSize::Fixed(8); 2797 Width = 8; 2798 MinOffset = 0; 2799 MaxOffset = 4095; 2800 break; 2801 case AArch64::StoreSwiftAsyncContext: 2802 // Store is an STRXui, but there might be an ADDXri in the expansion too. 
2803 Scale = TypeSize::Fixed(1); 2804 Width = 8; 2805 MinOffset = 0; 2806 MaxOffset = 4095; 2807 break; 2808 case AArch64::LDPWi: 2809 case AArch64::LDPSi: 2810 case AArch64::LDNPWi: 2811 case AArch64::LDNPSi: 2812 case AArch64::STPWi: 2813 case AArch64::STPSi: 2814 case AArch64::STNPWi: 2815 case AArch64::STNPSi: 2816 Scale = TypeSize::Fixed(4); 2817 Width = 8; 2818 MinOffset = -64; 2819 MaxOffset = 63; 2820 break; 2821 case AArch64::LDRWui: 2822 case AArch64::LDRSui: 2823 case AArch64::LDRSWui: 2824 case AArch64::STRWui: 2825 case AArch64::STRSui: 2826 Scale = TypeSize::Fixed(4); 2827 Width = 4; 2828 MinOffset = 0; 2829 MaxOffset = 4095; 2830 break; 2831 case AArch64::LDRHui: 2832 case AArch64::LDRHHui: 2833 case AArch64::LDRSHWui: 2834 case AArch64::LDRSHXui: 2835 case AArch64::STRHui: 2836 case AArch64::STRHHui: 2837 Scale = TypeSize::Fixed(2); 2838 Width = 2; 2839 MinOffset = 0; 2840 MaxOffset = 4095; 2841 break; 2842 case AArch64::LDRBui: 2843 case AArch64::LDRBBui: 2844 case AArch64::LDRSBWui: 2845 case AArch64::LDRSBXui: 2846 case AArch64::STRBui: 2847 case AArch64::STRBBui: 2848 Scale = TypeSize::Fixed(1); 2849 Width = 1; 2850 MinOffset = 0; 2851 MaxOffset = 4095; 2852 break; 2853 case AArch64::STPXpre: 2854 case AArch64::LDPXpost: 2855 case AArch64::STPDpre: 2856 case AArch64::LDPDpost: 2857 Scale = TypeSize::Fixed(8); 2858 Width = 8; 2859 MinOffset = -512; 2860 MaxOffset = 504; 2861 break; 2862 case AArch64::STPQpre: 2863 case AArch64::LDPQpost: 2864 Scale = TypeSize::Fixed(16); 2865 Width = 16; 2866 MinOffset = -1024; 2867 MaxOffset = 1008; 2868 break; 2869 case AArch64::STRXpre: 2870 case AArch64::STRDpre: 2871 case AArch64::LDRXpost: 2872 case AArch64::LDRDpost: 2873 Scale = TypeSize::Fixed(1); 2874 Width = 8; 2875 MinOffset = -256; 2876 MaxOffset = 255; 2877 break; 2878 case AArch64::STRQpre: 2879 case AArch64::LDRQpost: 2880 Scale = TypeSize::Fixed(1); 2881 Width = 16; 2882 MinOffset = -256; 2883 MaxOffset = 255; 2884 break; 2885 case AArch64::ADDG: 2886 Scale = TypeSize::Fixed(16); 2887 Width = 0; 2888 MinOffset = 0; 2889 MaxOffset = 63; 2890 break; 2891 case AArch64::TAGPstack: 2892 Scale = TypeSize::Fixed(16); 2893 Width = 0; 2894 // TAGP with a negative offset turns into SUBP, which has a maximum offset 2895 // of 63 (not 64!). 
2896 MinOffset = -63; 2897 MaxOffset = 63; 2898 break; 2899 case AArch64::LDG: 2900 case AArch64::STGi: 2901 case AArch64::STZGi: 2902 Scale = TypeSize::Fixed(16); 2903 Width = 16; 2904 MinOffset = -256; 2905 MaxOffset = 255; 2906 break; 2907 case AArch64::STR_ZZZZXI: 2908 case AArch64::LDR_ZZZZXI: 2909 Scale = TypeSize::Scalable(16); 2910 Width = SVEMaxBytesPerVector * 4; 2911 MinOffset = -256; 2912 MaxOffset = 252; 2913 break; 2914 case AArch64::STR_ZZZXI: 2915 case AArch64::LDR_ZZZXI: 2916 Scale = TypeSize::Scalable(16); 2917 Width = SVEMaxBytesPerVector * 3; 2918 MinOffset = -256; 2919 MaxOffset = 253; 2920 break; 2921 case AArch64::STR_ZZXI: 2922 case AArch64::LDR_ZZXI: 2923 Scale = TypeSize::Scalable(16); 2924 Width = SVEMaxBytesPerVector * 2; 2925 MinOffset = -256; 2926 MaxOffset = 254; 2927 break; 2928 case AArch64::LDR_PXI: 2929 case AArch64::STR_PXI: 2930 Scale = TypeSize::Scalable(2); 2931 Width = SVEMaxBytesPerVector / 8; 2932 MinOffset = -256; 2933 MaxOffset = 255; 2934 break; 2935 case AArch64::LDR_ZXI: 2936 case AArch64::STR_ZXI: 2937 Scale = TypeSize::Scalable(16); 2938 Width = SVEMaxBytesPerVector; 2939 MinOffset = -256; 2940 MaxOffset = 255; 2941 break; 2942 case AArch64::LD1B_IMM: 2943 case AArch64::LD1H_IMM: 2944 case AArch64::LD1W_IMM: 2945 case AArch64::LD1D_IMM: 2946 case AArch64::LDNT1B_ZRI: 2947 case AArch64::LDNT1H_ZRI: 2948 case AArch64::LDNT1W_ZRI: 2949 case AArch64::LDNT1D_ZRI: 2950 case AArch64::ST1B_IMM: 2951 case AArch64::ST1H_IMM: 2952 case AArch64::ST1W_IMM: 2953 case AArch64::ST1D_IMM: 2954 case AArch64::STNT1B_ZRI: 2955 case AArch64::STNT1H_ZRI: 2956 case AArch64::STNT1W_ZRI: 2957 case AArch64::STNT1D_ZRI: 2958 case AArch64::LDNF1B_IMM: 2959 case AArch64::LDNF1H_IMM: 2960 case AArch64::LDNF1W_IMM: 2961 case AArch64::LDNF1D_IMM: 2962 // A full vectors worth of data 2963 // Width = mbytes * elements 2964 Scale = TypeSize::Scalable(16); 2965 Width = SVEMaxBytesPerVector; 2966 MinOffset = -8; 2967 MaxOffset = 7; 2968 break; 2969 case AArch64::LD2B_IMM: 2970 case AArch64::LD2H_IMM: 2971 case AArch64::LD2W_IMM: 2972 case AArch64::LD2D_IMM: 2973 case AArch64::ST2B_IMM: 2974 case AArch64::ST2H_IMM: 2975 case AArch64::ST2W_IMM: 2976 case AArch64::ST2D_IMM: 2977 Scale = TypeSize::Scalable(32); 2978 Width = SVEMaxBytesPerVector * 2; 2979 MinOffset = -8; 2980 MaxOffset = 7; 2981 break; 2982 case AArch64::LD3B_IMM: 2983 case AArch64::LD3H_IMM: 2984 case AArch64::LD3W_IMM: 2985 case AArch64::LD3D_IMM: 2986 case AArch64::ST3B_IMM: 2987 case AArch64::ST3H_IMM: 2988 case AArch64::ST3W_IMM: 2989 case AArch64::ST3D_IMM: 2990 Scale = TypeSize::Scalable(48); 2991 Width = SVEMaxBytesPerVector * 3; 2992 MinOffset = -8; 2993 MaxOffset = 7; 2994 break; 2995 case AArch64::LD4B_IMM: 2996 case AArch64::LD4H_IMM: 2997 case AArch64::LD4W_IMM: 2998 case AArch64::LD4D_IMM: 2999 case AArch64::ST4B_IMM: 3000 case AArch64::ST4H_IMM: 3001 case AArch64::ST4W_IMM: 3002 case AArch64::ST4D_IMM: 3003 Scale = TypeSize::Scalable(64); 3004 Width = SVEMaxBytesPerVector * 4; 3005 MinOffset = -8; 3006 MaxOffset = 7; 3007 break; 3008 case AArch64::LD1B_H_IMM: 3009 case AArch64::LD1SB_H_IMM: 3010 case AArch64::LD1H_S_IMM: 3011 case AArch64::LD1SH_S_IMM: 3012 case AArch64::LD1W_D_IMM: 3013 case AArch64::LD1SW_D_IMM: 3014 case AArch64::ST1B_H_IMM: 3015 case AArch64::ST1H_S_IMM: 3016 case AArch64::ST1W_D_IMM: 3017 case AArch64::LDNF1B_H_IMM: 3018 case AArch64::LDNF1SB_H_IMM: 3019 case AArch64::LDNF1H_S_IMM: 3020 case AArch64::LDNF1SH_S_IMM: 3021 case AArch64::LDNF1W_D_IMM: 3022 case 
AArch64::LDNF1SW_D_IMM: 3023 // A half vector worth of data 3024 // Width = mbytes * elements 3025 Scale = TypeSize::Scalable(8); 3026 Width = SVEMaxBytesPerVector / 2; 3027 MinOffset = -8; 3028 MaxOffset = 7; 3029 break; 3030 case AArch64::LD1B_S_IMM: 3031 case AArch64::LD1SB_S_IMM: 3032 case AArch64::LD1H_D_IMM: 3033 case AArch64::LD1SH_D_IMM: 3034 case AArch64::ST1B_S_IMM: 3035 case AArch64::ST1H_D_IMM: 3036 case AArch64::LDNF1B_S_IMM: 3037 case AArch64::LDNF1SB_S_IMM: 3038 case AArch64::LDNF1H_D_IMM: 3039 case AArch64::LDNF1SH_D_IMM: 3040 // A quarter vector worth of data 3041 // Width = mbytes * elements 3042 Scale = TypeSize::Scalable(4); 3043 Width = SVEMaxBytesPerVector / 4; 3044 MinOffset = -8; 3045 MaxOffset = 7; 3046 break; 3047 case AArch64::LD1B_D_IMM: 3048 case AArch64::LD1SB_D_IMM: 3049 case AArch64::ST1B_D_IMM: 3050 case AArch64::LDNF1B_D_IMM: 3051 case AArch64::LDNF1SB_D_IMM: 3052 // A eighth vector worth of data 3053 // Width = mbytes * elements 3054 Scale = TypeSize::Scalable(2); 3055 Width = SVEMaxBytesPerVector / 8; 3056 MinOffset = -8; 3057 MaxOffset = 7; 3058 break; 3059 case AArch64::ST2Gi: 3060 case AArch64::STZ2Gi: 3061 Scale = TypeSize::Fixed(16); 3062 Width = 32; 3063 MinOffset = -256; 3064 MaxOffset = 255; 3065 break; 3066 case AArch64::STGPi: 3067 Scale = TypeSize::Fixed(16); 3068 Width = 16; 3069 MinOffset = -64; 3070 MaxOffset = 63; 3071 break; 3072 case AArch64::LD1RB_IMM: 3073 case AArch64::LD1RB_H_IMM: 3074 case AArch64::LD1RB_S_IMM: 3075 case AArch64::LD1RB_D_IMM: 3076 case AArch64::LD1RSB_H_IMM: 3077 case AArch64::LD1RSB_S_IMM: 3078 case AArch64::LD1RSB_D_IMM: 3079 Scale = TypeSize::Fixed(1); 3080 Width = 1; 3081 MinOffset = 0; 3082 MaxOffset = 63; 3083 break; 3084 case AArch64::LD1RH_IMM: 3085 case AArch64::LD1RH_S_IMM: 3086 case AArch64::LD1RH_D_IMM: 3087 case AArch64::LD1RSH_S_IMM: 3088 case AArch64::LD1RSH_D_IMM: 3089 Scale = TypeSize::Fixed(2); 3090 Width = 2; 3091 MinOffset = 0; 3092 MaxOffset = 63; 3093 break; 3094 case AArch64::LD1RW_IMM: 3095 case AArch64::LD1RW_D_IMM: 3096 case AArch64::LD1RSW_IMM: 3097 Scale = TypeSize::Fixed(4); 3098 Width = 4; 3099 MinOffset = 0; 3100 MaxOffset = 63; 3101 break; 3102 case AArch64::LD1RD_IMM: 3103 Scale = TypeSize::Fixed(8); 3104 Width = 8; 3105 MinOffset = 0; 3106 MaxOffset = 63; 3107 break; 3108 } 3109 3110 return true; 3111 } 3112 3113 // Scaling factor for unscaled load or store. 
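// For example, getMemScale(AArch64::LDURXi) and getMemScale(AArch64::LDRXui)
// both return 8, so a byte offset of 24 on the unscaled form corresponds to an
// element offset of 3 once converted for the scaled or paired forms (see
// scaleOffset below).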
3114 int AArch64InstrInfo::getMemScale(unsigned Opc) { 3115 switch (Opc) { 3116 default: 3117 llvm_unreachable("Opcode has unknown scale!"); 3118 case AArch64::LDRBBui: 3119 case AArch64::LDURBBi: 3120 case AArch64::LDRSBWui: 3121 case AArch64::LDURSBWi: 3122 case AArch64::STRBBui: 3123 case AArch64::STURBBi: 3124 return 1; 3125 case AArch64::LDRHHui: 3126 case AArch64::LDURHHi: 3127 case AArch64::LDRSHWui: 3128 case AArch64::LDURSHWi: 3129 case AArch64::STRHHui: 3130 case AArch64::STURHHi: 3131 return 2; 3132 case AArch64::LDRSui: 3133 case AArch64::LDURSi: 3134 case AArch64::LDRSpre: 3135 case AArch64::LDRSWui: 3136 case AArch64::LDURSWi: 3137 case AArch64::LDRWpre: 3138 case AArch64::LDRWui: 3139 case AArch64::LDURWi: 3140 case AArch64::STRSui: 3141 case AArch64::STURSi: 3142 case AArch64::STRSpre: 3143 case AArch64::STRWui: 3144 case AArch64::STURWi: 3145 case AArch64::STRWpre: 3146 case AArch64::LDPSi: 3147 case AArch64::LDPSWi: 3148 case AArch64::LDPWi: 3149 case AArch64::STPSi: 3150 case AArch64::STPWi: 3151 return 4; 3152 case AArch64::LDRDui: 3153 case AArch64::LDURDi: 3154 case AArch64::LDRDpre: 3155 case AArch64::LDRXui: 3156 case AArch64::LDURXi: 3157 case AArch64::LDRXpre: 3158 case AArch64::STRDui: 3159 case AArch64::STURDi: 3160 case AArch64::STRDpre: 3161 case AArch64::STRXui: 3162 case AArch64::STURXi: 3163 case AArch64::STRXpre: 3164 case AArch64::LDPDi: 3165 case AArch64::LDPXi: 3166 case AArch64::STPDi: 3167 case AArch64::STPXi: 3168 return 8; 3169 case AArch64::LDRQui: 3170 case AArch64::LDURQi: 3171 case AArch64::STRQui: 3172 case AArch64::STURQi: 3173 case AArch64::STRQpre: 3174 case AArch64::LDPQi: 3175 case AArch64::LDRQpre: 3176 case AArch64::STPQi: 3177 case AArch64::STGi: 3178 case AArch64::STZGi: 3179 case AArch64::ST2Gi: 3180 case AArch64::STZ2Gi: 3181 case AArch64::STGPi: 3182 return 16; 3183 } 3184 } 3185 3186 bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) { 3187 switch (MI.getOpcode()) { 3188 default: 3189 return false; 3190 case AArch64::LDRWpre: 3191 case AArch64::LDRXpre: 3192 case AArch64::LDRSpre: 3193 case AArch64::LDRDpre: 3194 case AArch64::LDRQpre: 3195 return true; 3196 } 3197 } 3198 3199 bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) { 3200 switch (MI.getOpcode()) { 3201 default: 3202 return false; 3203 case AArch64::STRWpre: 3204 case AArch64::STRXpre: 3205 case AArch64::STRSpre: 3206 case AArch64::STRDpre: 3207 case AArch64::STRQpre: 3208 return true; 3209 } 3210 } 3211 3212 bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) { 3213 return isPreLd(MI) || isPreSt(MI); 3214 } 3215 3216 bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) { 3217 switch (MI.getOpcode()) { 3218 default: 3219 return false; 3220 case AArch64::LDPSi: 3221 case AArch64::LDPSWi: 3222 case AArch64::LDPDi: 3223 case AArch64::LDPQi: 3224 case AArch64::LDPWi: 3225 case AArch64::LDPXi: 3226 case AArch64::STPSi: 3227 case AArch64::STPDi: 3228 case AArch64::STPQi: 3229 case AArch64::STPWi: 3230 case AArch64::STPXi: 3231 case AArch64::STGPi: 3232 return true; 3233 } 3234 } 3235 3236 const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) { 3237 unsigned Idx = 3238 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2 3239 : 1; 3240 return MI.getOperand(Idx); 3241 } 3242 3243 const MachineOperand & 3244 AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) { 3245 unsigned Idx = 3246 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 
3 3247 : 2; 3248 return MI.getOperand(Idx); 3249 } 3250 3251 static const TargetRegisterClass *getRegClass(const MachineInstr &MI, 3252 Register Reg) { 3253 if (MI.getParent() == nullptr) 3254 return nullptr; 3255 const MachineFunction *MF = MI.getParent()->getParent(); 3256 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr; 3257 } 3258 3259 bool AArch64InstrInfo::isHForm(const MachineInstr &MI) { 3260 auto IsHFPR = [&](const MachineOperand &Op) { 3261 if (!Op.isReg()) 3262 return false; 3263 auto Reg = Op.getReg(); 3264 if (Reg.isPhysical()) 3265 return AArch64::FPR16RegClass.contains(Reg); 3266 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); 3267 return TRC == &AArch64::FPR16RegClass || 3268 TRC == &AArch64::FPR16_loRegClass; 3269 }; 3270 return llvm::any_of(MI.operands(), IsHFPR); 3271 } 3272 3273 bool AArch64InstrInfo::isQForm(const MachineInstr &MI) { 3274 auto IsQFPR = [&](const MachineOperand &Op) { 3275 if (!Op.isReg()) 3276 return false; 3277 auto Reg = Op.getReg(); 3278 if (Reg.isPhysical()) 3279 return AArch64::FPR128RegClass.contains(Reg); 3280 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); 3281 return TRC == &AArch64::FPR128RegClass || 3282 TRC == &AArch64::FPR128_loRegClass; 3283 }; 3284 return llvm::any_of(MI.operands(), IsQFPR); 3285 } 3286 3287 bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) { 3288 auto IsFPR = [&](const MachineOperand &Op) { 3289 if (!Op.isReg()) 3290 return false; 3291 auto Reg = Op.getReg(); 3292 if (Reg.isPhysical()) 3293 return AArch64::FPR128RegClass.contains(Reg) || 3294 AArch64::FPR64RegClass.contains(Reg) || 3295 AArch64::FPR32RegClass.contains(Reg) || 3296 AArch64::FPR16RegClass.contains(Reg) || 3297 AArch64::FPR8RegClass.contains(Reg); 3298 3299 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); 3300 return TRC == &AArch64::FPR128RegClass || 3301 TRC == &AArch64::FPR128_loRegClass || 3302 TRC == &AArch64::FPR64RegClass || 3303 TRC == &AArch64::FPR64_loRegClass || 3304 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass || 3305 TRC == &AArch64::FPR8RegClass; 3306 }; 3307 return llvm::any_of(MI.operands(), IsFPR); 3308 } 3309 3310 // Scale the unscaled offsets. Returns false if the unscaled offset can't be 3311 // scaled. 3312 static bool scaleOffset(unsigned Opc, int64_t &Offset) { 3313 int Scale = AArch64InstrInfo::getMemScale(Opc); 3314 3315 // If the byte-offset isn't a multiple of the stride, we can't scale this 3316 // offset. 3317 if (Offset % Scale != 0) 3318 return false; 3319 3320 // Convert the byte-offset used by unscaled into an "element" offset used 3321 // by the scaled pair load/store instructions. 3322 Offset /= Scale; 3323 return true; 3324 } 3325 3326 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) { 3327 if (FirstOpc == SecondOpc) 3328 return true; 3329 // We can also pair sign-ext and zero-ext instructions. 3330 switch (FirstOpc) { 3331 default: 3332 return false; 3333 case AArch64::LDRWui: 3334 case AArch64::LDURWi: 3335 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi; 3336 case AArch64::LDRSWui: 3337 case AArch64::LDURSWi: 3338 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi; 3339 } 3340 // These instructions can't be paired based on their opcodes. 
3341 return false; 3342 } 3343 3344 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, 3345 int64_t Offset1, unsigned Opcode1, int FI2, 3346 int64_t Offset2, unsigned Opcode2) { 3347 // Accesses through fixed stack object frame indices may access a different 3348 // fixed stack slot. Check that the object offsets + offsets match. 3349 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) { 3350 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1); 3351 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2); 3352 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered."); 3353 // Convert to scaled object offsets. 3354 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1); 3355 if (ObjectOffset1 % Scale1 != 0) 3356 return false; 3357 ObjectOffset1 /= Scale1; 3358 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2); 3359 if (ObjectOffset2 % Scale2 != 0) 3360 return false; 3361 ObjectOffset2 /= Scale2; 3362 ObjectOffset1 += Offset1; 3363 ObjectOffset2 += Offset2; 3364 return ObjectOffset1 + 1 == ObjectOffset2; 3365 } 3366 3367 return FI1 == FI2; 3368 } 3369 3370 /// Detect opportunities for ldp/stp formation. 3371 /// 3372 /// Only called for LdSt for which getMemOperandWithOffset returns true. 3373 bool AArch64InstrInfo::shouldClusterMemOps( 3374 ArrayRef<const MachineOperand *> BaseOps1, 3375 ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads, 3376 unsigned NumBytes) const { 3377 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1); 3378 const MachineOperand &BaseOp1 = *BaseOps1.front(); 3379 const MachineOperand &BaseOp2 = *BaseOps2.front(); 3380 const MachineInstr &FirstLdSt = *BaseOp1.getParent(); 3381 const MachineInstr &SecondLdSt = *BaseOp2.getParent(); 3382 if (BaseOp1.getType() != BaseOp2.getType()) 3383 return false; 3384 3385 assert((BaseOp1.isReg() || BaseOp1.isFI()) && 3386 "Only base registers and frame indices are supported."); 3387 3388 // Check for both base regs and base FI. 3389 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg()) 3390 return false; 3391 3392 // Only cluster up to a single pair. 3393 if (NumLoads > 2) 3394 return false; 3395 3396 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt)) 3397 return false; 3398 3399 // Can we pair these instructions based on their opcodes? 3400 unsigned FirstOpc = FirstLdSt.getOpcode(); 3401 unsigned SecondOpc = SecondLdSt.getOpcode(); 3402 if (!canPairLdStOpc(FirstOpc, SecondOpc)) 3403 return false; 3404 3405 // Can't merge volatiles or load/stores that have a hint to avoid pair 3406 // formation, for example. 3407 if (!isCandidateToMergeOrPair(FirstLdSt) || 3408 !isCandidateToMergeOrPair(SecondLdSt)) 3409 return false; 3410 3411 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate. 3412 int64_t Offset1 = FirstLdSt.getOperand(2).getImm(); 3413 if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1)) 3414 return false; 3415 3416 int64_t Offset2 = SecondLdSt.getOperand(2).getImm(); 3417 if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2)) 3418 return false; 3419 3420 // Pairwise instructions have a 7-bit signed offset field. 3421 if (Offset1 > 63 || Offset1 < -64) 3422 return false; 3423 3424 // The caller should already have ordered First/SecondLdSt by offset. 
3425 // Note: except for non-equal frame index bases 3426 if (BaseOp1.isFI()) { 3427 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) && 3428 "Caller should have ordered offsets."); 3429 3430 const MachineFrameInfo &MFI = 3431 FirstLdSt.getParent()->getParent()->getFrameInfo(); 3432 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc, 3433 BaseOp2.getIndex(), Offset2, SecondOpc); 3434 } 3435 3436 assert(Offset1 <= Offset2 && "Caller should have ordered offsets."); 3437 3438 return Offset1 + 1 == Offset2; 3439 } 3440 3441 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, 3442 unsigned Reg, unsigned SubIdx, 3443 unsigned State, 3444 const TargetRegisterInfo *TRI) { 3445 if (!SubIdx) 3446 return MIB.addReg(Reg, State); 3447 3448 if (Register::isPhysicalRegister(Reg)) 3449 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); 3450 return MIB.addReg(Reg, State, SubIdx); 3451 } 3452 3453 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, 3454 unsigned NumRegs) { 3455 // We really want the positive remainder mod 32 here, that happens to be 3456 // easily obtainable with a mask. 3457 return ((DestReg - SrcReg) & 0x1f) < NumRegs; 3458 } 3459 3460 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB, 3461 MachineBasicBlock::iterator I, 3462 const DebugLoc &DL, MCRegister DestReg, 3463 MCRegister SrcReg, bool KillSrc, 3464 unsigned Opcode, 3465 ArrayRef<unsigned> Indices) const { 3466 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON"); 3467 const TargetRegisterInfo *TRI = &getRegisterInfo(); 3468 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 3469 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 3470 unsigned NumRegs = Indices.size(); 3471 3472 int SubReg = 0, End = NumRegs, Incr = 1; 3473 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) { 3474 SubReg = NumRegs - 1; 3475 End = -1; 3476 Incr = -1; 3477 } 3478 3479 for (; SubReg != End; SubReg += Incr) { 3480 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 3481 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 3482 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); 3483 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 3484 } 3485 } 3486 3487 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB, 3488 MachineBasicBlock::iterator I, 3489 DebugLoc DL, unsigned DestReg, 3490 unsigned SrcReg, bool KillSrc, 3491 unsigned Opcode, unsigned ZeroReg, 3492 llvm::ArrayRef<unsigned> Indices) const { 3493 const TargetRegisterInfo *TRI = &getRegisterInfo(); 3494 unsigned NumRegs = Indices.size(); 3495 3496 #ifndef NDEBUG 3497 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 3498 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 3499 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 && 3500 "GPR reg sequences should not be able to overlap"); 3501 #endif 3502 3503 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) { 3504 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 3505 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 3506 MIB.addReg(ZeroReg); 3507 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 3508 MIB.addImm(0); 3509 } 3510 } 3511 3512 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, 3513 MachineBasicBlock::iterator I, 3514 const DebugLoc &DL, MCRegister DestReg, 3515 MCRegister SrcReg, bool KillSrc) const { 3516 if (AArch64::GPR32spRegClass.contains(DestReg) && 
3517 (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { 3518 const TargetRegisterInfo *TRI = &getRegisterInfo(); 3519 3520 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) { 3521 // If either operand is WSP, expand to ADD #0. 3522 if (Subtarget.hasZeroCycleRegMove()) { 3523 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. 3524 MCRegister DestRegX = TRI->getMatchingSuperReg( 3525 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 3526 MCRegister SrcRegX = TRI->getMatchingSuperReg( 3527 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 3528 // This instruction is reading and writing X registers. This may upset 3529 // the register scavenger and machine verifier, so we need to indicate 3530 // that we are reading an undefined value from SrcRegX, but a proper 3531 // value from SrcReg. 3532 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX) 3533 .addReg(SrcRegX, RegState::Undef) 3534 .addImm(0) 3535 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) 3536 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 3537 } else { 3538 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg) 3539 .addReg(SrcReg, getKillRegState(KillSrc)) 3540 .addImm(0) 3541 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3542 } 3543 } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) { 3544 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg) 3545 .addImm(0) 3546 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3547 } else { 3548 if (Subtarget.hasZeroCycleRegMove()) { 3549 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. 3550 MCRegister DestRegX = TRI->getMatchingSuperReg( 3551 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 3552 MCRegister SrcRegX = TRI->getMatchingSuperReg( 3553 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 3554 // This instruction is reading and writing X registers. This may upset 3555 // the register scavenger and machine verifier, so we need to indicate 3556 // that we are reading an undefined value from SrcRegX, but a proper 3557 // value from SrcReg. 3558 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX) 3559 .addReg(AArch64::XZR) 3560 .addReg(SrcRegX, RegState::Undef) 3561 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 3562 } else { 3563 // Otherwise, expand to ORR WZR. 3564 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg) 3565 .addReg(AArch64::WZR) 3566 .addReg(SrcReg, getKillRegState(KillSrc)); 3567 } 3568 } 3569 return; 3570 } 3571 3572 // Copy a Predicate register by ORRing with itself. 3573 if (AArch64::PPRRegClass.contains(DestReg) && 3574 AArch64::PPRRegClass.contains(SrcReg)) { 3575 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 3576 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg) 3577 .addReg(SrcReg) // Pg 3578 .addReg(SrcReg) 3579 .addReg(SrcReg, getKillRegState(KillSrc)); 3580 return; 3581 } 3582 3583 // Copy a Z register by ORRing with itself. 3584 if (AArch64::ZPRRegClass.contains(DestReg) && 3585 AArch64::ZPRRegClass.contains(SrcReg)) { 3586 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 3587 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg) 3588 .addReg(SrcReg) 3589 .addReg(SrcReg, getKillRegState(KillSrc)); 3590 return; 3591 } 3592 3593 // Copy a Z register pair by copying the individual sub-registers. 
3594 if (AArch64::ZPR2RegClass.contains(DestReg) && 3595 AArch64::ZPR2RegClass.contains(SrcReg)) { 3596 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 3597 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1}; 3598 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 3599 Indices); 3600 return; 3601 } 3602 3603 // Copy a Z register triple by copying the individual sub-registers. 3604 if (AArch64::ZPR3RegClass.contains(DestReg) && 3605 AArch64::ZPR3RegClass.contains(SrcReg)) { 3606 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 3607 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, 3608 AArch64::zsub2}; 3609 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 3610 Indices); 3611 return; 3612 } 3613 3614 // Copy a Z register quad by copying the individual sub-registers. 3615 if (AArch64::ZPR4RegClass.contains(DestReg) && 3616 AArch64::ZPR4RegClass.contains(SrcReg)) { 3617 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 3618 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, 3619 AArch64::zsub2, AArch64::zsub3}; 3620 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 3621 Indices); 3622 return; 3623 } 3624 3625 if (AArch64::GPR64spRegClass.contains(DestReg) && 3626 (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { 3627 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { 3628 // If either operand is SP, expand to ADD #0. 3629 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg) 3630 .addReg(SrcReg, getKillRegState(KillSrc)) 3631 .addImm(0) 3632 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3633 } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) { 3634 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg) 3635 .addImm(0) 3636 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3637 } else { 3638 // Otherwise, expand to ORR XZR. 3639 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg) 3640 .addReg(AArch64::XZR) 3641 .addReg(SrcReg, getKillRegState(KillSrc)); 3642 } 3643 return; 3644 } 3645 3646 // Copy a DDDD register quad by copying the individual sub-registers. 3647 if (AArch64::DDDDRegClass.contains(DestReg) && 3648 AArch64::DDDDRegClass.contains(SrcReg)) { 3649 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 3650 AArch64::dsub2, AArch64::dsub3}; 3651 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 3652 Indices); 3653 return; 3654 } 3655 3656 // Copy a DDD register triple by copying the individual sub-registers. 3657 if (AArch64::DDDRegClass.contains(DestReg) && 3658 AArch64::DDDRegClass.contains(SrcReg)) { 3659 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 3660 AArch64::dsub2}; 3661 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 3662 Indices); 3663 return; 3664 } 3665 3666 // Copy a DD register pair by copying the individual sub-registers. 3667 if (AArch64::DDRegClass.contains(DestReg) && 3668 AArch64::DDRegClass.contains(SrcReg)) { 3669 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1}; 3670 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 3671 Indices); 3672 return; 3673 } 3674 3675 // Copy a QQQQ register quad by copying the individual sub-registers. 
3676 if (AArch64::QQQQRegClass.contains(DestReg) && 3677 AArch64::QQQQRegClass.contains(SrcReg)) { 3678 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 3679 AArch64::qsub2, AArch64::qsub3}; 3680 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 3681 Indices); 3682 return; 3683 } 3684 3685 // Copy a QQQ register triple by copying the individual sub-registers. 3686 if (AArch64::QQQRegClass.contains(DestReg) && 3687 AArch64::QQQRegClass.contains(SrcReg)) { 3688 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 3689 AArch64::qsub2}; 3690 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 3691 Indices); 3692 return; 3693 } 3694 3695 // Copy a QQ register pair by copying the individual sub-registers. 3696 if (AArch64::QQRegClass.contains(DestReg) && 3697 AArch64::QQRegClass.contains(SrcReg)) { 3698 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1}; 3699 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 3700 Indices); 3701 return; 3702 } 3703 3704 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) && 3705 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) { 3706 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64}; 3707 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs, 3708 AArch64::XZR, Indices); 3709 return; 3710 } 3711 3712 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) && 3713 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) { 3714 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32}; 3715 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs, 3716 AArch64::WZR, Indices); 3717 return; 3718 } 3719 3720 if (AArch64::FPR128RegClass.contains(DestReg) && 3721 AArch64::FPR128RegClass.contains(SrcReg)) { 3722 if (Subtarget.hasSVEorSME() && !Subtarget.isNeonAvailable()) 3723 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ)) 3724 .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define) 3725 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0)) 3726 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0)); 3727 else if (Subtarget.hasNEON()) 3728 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 3729 .addReg(SrcReg) 3730 .addReg(SrcReg, getKillRegState(KillSrc)); 3731 else { 3732 BuildMI(MBB, I, DL, get(AArch64::STRQpre)) 3733 .addReg(AArch64::SP, RegState::Define) 3734 .addReg(SrcReg, getKillRegState(KillSrc)) 3735 .addReg(AArch64::SP) 3736 .addImm(-16); 3737 BuildMI(MBB, I, DL, get(AArch64::LDRQpre)) 3738 .addReg(AArch64::SP, RegState::Define) 3739 .addReg(DestReg, RegState::Define) 3740 .addReg(AArch64::SP) 3741 .addImm(16); 3742 } 3743 return; 3744 } 3745 3746 if (AArch64::FPR64RegClass.contains(DestReg) && 3747 AArch64::FPR64RegClass.contains(SrcReg)) { 3748 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg) 3749 .addReg(SrcReg, getKillRegState(KillSrc)); 3750 return; 3751 } 3752 3753 if (AArch64::FPR32RegClass.contains(DestReg) && 3754 AArch64::FPR32RegClass.contains(SrcReg)) { 3755 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 3756 .addReg(SrcReg, getKillRegState(KillSrc)); 3757 return; 3758 } 3759 3760 if (AArch64::FPR16RegClass.contains(DestReg) && 3761 AArch64::FPR16RegClass.contains(SrcReg)) { 3762 DestReg = 3763 RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass); 3764 SrcReg = 3765 RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass); 3766 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 3767 .addReg(SrcReg, getKillRegState(KillSrc)); 3768 return; 3769 } 3770 
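  // As with the FPR16 case above, there is no byte-sized FP register move,
  // so an FPR8 copy is widened to the containing S registers and emitted as
  // FMOVSr (e.g. a copy of B1 into B0 comes out as "fmov s0, s1"); only the
  // low 8 bits are meaningful to readers of the B register.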
3771 if (AArch64::FPR8RegClass.contains(DestReg) && 3772 AArch64::FPR8RegClass.contains(SrcReg)) { 3773 DestReg = 3774 RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass); 3775 SrcReg = 3776 RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass); 3777 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 3778 .addReg(SrcReg, getKillRegState(KillSrc)); 3779 return; 3780 } 3781 3782 // Copies between GPR64 and FPR64. 3783 if (AArch64::FPR64RegClass.contains(DestReg) && 3784 AArch64::GPR64RegClass.contains(SrcReg)) { 3785 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) 3786 .addReg(SrcReg, getKillRegState(KillSrc)); 3787 return; 3788 } 3789 if (AArch64::GPR64RegClass.contains(DestReg) && 3790 AArch64::FPR64RegClass.contains(SrcReg)) { 3791 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg) 3792 .addReg(SrcReg, getKillRegState(KillSrc)); 3793 return; 3794 } 3795 // Copies between GPR32 and FPR32. 3796 if (AArch64::FPR32RegClass.contains(DestReg) && 3797 AArch64::GPR32RegClass.contains(SrcReg)) { 3798 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) 3799 .addReg(SrcReg, getKillRegState(KillSrc)); 3800 return; 3801 } 3802 if (AArch64::GPR32RegClass.contains(DestReg) && 3803 AArch64::FPR32RegClass.contains(SrcReg)) { 3804 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg) 3805 .addReg(SrcReg, getKillRegState(KillSrc)); 3806 return; 3807 } 3808 3809 if (DestReg == AArch64::NZCV) { 3810 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy"); 3811 BuildMI(MBB, I, DL, get(AArch64::MSR)) 3812 .addImm(AArch64SysReg::NZCV) 3813 .addReg(SrcReg, getKillRegState(KillSrc)) 3814 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define); 3815 return; 3816 } 3817 3818 if (SrcReg == AArch64::NZCV) { 3819 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy"); 3820 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg) 3821 .addImm(AArch64SysReg::NZCV) 3822 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); 3823 return; 3824 } 3825 3826 #ifndef NDEBUG 3827 const TargetRegisterInfo &TRI = getRegisterInfo(); 3828 errs() << TRI.getRegAsmName(DestReg) << " = COPY " 3829 << TRI.getRegAsmName(SrcReg) << "\n"; 3830 #endif 3831 llvm_unreachable("unimplemented reg-to-reg copy"); 3832 } 3833 3834 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, 3835 MachineBasicBlock &MBB, 3836 MachineBasicBlock::iterator InsertBefore, 3837 const MCInstrDesc &MCID, 3838 Register SrcReg, bool IsKill, 3839 unsigned SubIdx0, unsigned SubIdx1, int FI, 3840 MachineMemOperand *MMO) { 3841 Register SrcReg0 = SrcReg; 3842 Register SrcReg1 = SrcReg; 3843 if (SrcReg.isPhysical()) { 3844 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0); 3845 SubIdx0 = 0; 3846 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1); 3847 SubIdx1 = 0; 3848 } 3849 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 3850 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0) 3851 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1) 3852 .addFrameIndex(FI) 3853 .addImm(0) 3854 .addMemOperand(MMO); 3855 } 3856 3857 void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, 3858 MachineBasicBlock::iterator MBBI, 3859 Register SrcReg, bool isKill, int FI, 3860 const TargetRegisterClass *RC, 3861 const TargetRegisterInfo *TRI, 3862 Register VReg) const { 3863 MachineFunction &MF = *MBB.getParent(); 3864 MachineFrameInfo &MFI = MF.getFrameInfo(); 3865 3866 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 3867 MachineMemOperand *MMO = 3868 MF.getMachineMemOperand(PtrInfo, 
MachineMemOperand::MOStore, 3869 MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); 3870 unsigned Opc = 0; 3871 bool Offset = true; 3872 unsigned StackID = TargetStackID::Default; 3873 switch (TRI->getSpillSize(*RC)) { 3874 case 1: 3875 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 3876 Opc = AArch64::STRBui; 3877 break; 3878 case 2: 3879 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 3880 Opc = AArch64::STRHui; 3881 else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { 3882 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3883 Opc = AArch64::STR_PXI; 3884 StackID = TargetStackID::ScalableVector; 3885 } 3886 break; 3887 case 4: 3888 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 3889 Opc = AArch64::STRWui; 3890 if (SrcReg.isVirtual()) 3891 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass); 3892 else 3893 assert(SrcReg != AArch64::WSP); 3894 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 3895 Opc = AArch64::STRSui; 3896 break; 3897 case 8: 3898 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 3899 Opc = AArch64::STRXui; 3900 if (SrcReg.isVirtual()) 3901 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 3902 else 3903 assert(SrcReg != AArch64::SP); 3904 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 3905 Opc = AArch64::STRDui; 3906 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 3907 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 3908 get(AArch64::STPWi), SrcReg, isKill, 3909 AArch64::sube32, AArch64::subo32, FI, MMO); 3910 return; 3911 } 3912 break; 3913 case 16: 3914 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 3915 Opc = AArch64::STRQui; 3916 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 3917 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3918 Opc = AArch64::ST1Twov1d; 3919 Offset = false; 3920 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 3921 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 3922 get(AArch64::STPXi), SrcReg, isKill, 3923 AArch64::sube64, AArch64::subo64, FI, MMO); 3924 return; 3925 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 3926 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3927 Opc = AArch64::STR_ZXI; 3928 StackID = TargetStackID::ScalableVector; 3929 } 3930 break; 3931 case 24: 3932 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 3933 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3934 Opc = AArch64::ST1Threev1d; 3935 Offset = false; 3936 } 3937 break; 3938 case 32: 3939 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 3940 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3941 Opc = AArch64::ST1Fourv1d; 3942 Offset = false; 3943 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 3944 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3945 Opc = AArch64::ST1Twov2d; 3946 Offset = false; 3947 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { 3948 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3949 Opc = AArch64::STR_ZZXI; 3950 StackID = TargetStackID::ScalableVector; 3951 } 3952 break; 3953 case 48: 3954 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 3955 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3956 Opc = AArch64::ST1Threev2d; 3957 Offset = false; 3958 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { 3959 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3960 Opc = AArch64::STR_ZZZXI; 3961 StackID = TargetStackID::ScalableVector; 3962 } 3963 
break; 3964 case 64: 3965 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 3966 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3967 Opc = AArch64::ST1Fourv2d; 3968 Offset = false; 3969 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { 3970 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3971 Opc = AArch64::STR_ZZZZXI; 3972 StackID = TargetStackID::ScalableVector; 3973 } 3974 break; 3975 } 3976 assert(Opc && "Unknown register class"); 3977 MFI.setStackID(FI, StackID); 3978 3979 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 3980 .addReg(SrcReg, getKillRegState(isKill)) 3981 .addFrameIndex(FI); 3982 3983 if (Offset) 3984 MI.addImm(0); 3985 MI.addMemOperand(MMO); 3986 } 3987 3988 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, 3989 MachineBasicBlock &MBB, 3990 MachineBasicBlock::iterator InsertBefore, 3991 const MCInstrDesc &MCID, 3992 Register DestReg, unsigned SubIdx0, 3993 unsigned SubIdx1, int FI, 3994 MachineMemOperand *MMO) { 3995 Register DestReg0 = DestReg; 3996 Register DestReg1 = DestReg; 3997 bool IsUndef = true; 3998 if (DestReg.isPhysical()) { 3999 DestReg0 = TRI.getSubReg(DestReg, SubIdx0); 4000 SubIdx0 = 0; 4001 DestReg1 = TRI.getSubReg(DestReg, SubIdx1); 4002 SubIdx1 = 0; 4003 IsUndef = false; 4004 } 4005 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 4006 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0) 4007 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1) 4008 .addFrameIndex(FI) 4009 .addImm(0) 4010 .addMemOperand(MMO); 4011 } 4012 4013 void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, 4014 MachineBasicBlock::iterator MBBI, 4015 Register DestReg, int FI, 4016 const TargetRegisterClass *RC, 4017 const TargetRegisterInfo *TRI, 4018 Register VReg) const { 4019 MachineFunction &MF = *MBB.getParent(); 4020 MachineFrameInfo &MFI = MF.getFrameInfo(); 4021 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 4022 MachineMemOperand *MMO = 4023 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, 4024 MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); 4025 4026 unsigned Opc = 0; 4027 bool Offset = true; 4028 unsigned StackID = TargetStackID::Default; 4029 switch (TRI->getSpillSize(*RC)) { 4030 case 1: 4031 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 4032 Opc = AArch64::LDRBui; 4033 break; 4034 case 2: 4035 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 4036 Opc = AArch64::LDRHui; 4037 else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { 4038 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 4039 Opc = AArch64::LDR_PXI; 4040 StackID = TargetStackID::ScalableVector; 4041 } 4042 break; 4043 case 4: 4044 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 4045 Opc = AArch64::LDRWui; 4046 if (DestReg.isVirtual()) 4047 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass); 4048 else 4049 assert(DestReg != AArch64::WSP); 4050 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 4051 Opc = AArch64::LDRSui; 4052 break; 4053 case 8: 4054 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 4055 Opc = AArch64::LDRXui; 4056 if (DestReg.isVirtual()) 4057 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass); 4058 else 4059 assert(DestReg != AArch64::SP); 4060 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 4061 Opc = AArch64::LDRDui; 4062 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 4063 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 4064 
get(AArch64::LDPWi), DestReg, AArch64::sube32, 4065 AArch64::subo32, FI, MMO); 4066 return; 4067 } 4068 break; 4069 case 16: 4070 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 4071 Opc = AArch64::LDRQui; 4072 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 4073 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 4074 Opc = AArch64::LD1Twov1d; 4075 Offset = false; 4076 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 4077 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 4078 get(AArch64::LDPXi), DestReg, AArch64::sube64, 4079 AArch64::subo64, FI, MMO); 4080 return; 4081 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 4082 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 4083 Opc = AArch64::LDR_ZXI; 4084 StackID = TargetStackID::ScalableVector; 4085 } 4086 break; 4087 case 24: 4088 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 4089 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 4090 Opc = AArch64::LD1Threev1d; 4091 Offset = false; 4092 } 4093 break; 4094 case 32: 4095 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 4096 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 4097 Opc = AArch64::LD1Fourv1d; 4098 Offset = false; 4099 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 4100 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 4101 Opc = AArch64::LD1Twov2d; 4102 Offset = false; 4103 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { 4104 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 4105 Opc = AArch64::LDR_ZZXI; 4106 StackID = TargetStackID::ScalableVector; 4107 } 4108 break; 4109 case 48: 4110 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 4111 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 4112 Opc = AArch64::LD1Threev2d; 4113 Offset = false; 4114 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { 4115 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 4116 Opc = AArch64::LDR_ZZZXI; 4117 StackID = TargetStackID::ScalableVector; 4118 } 4119 break; 4120 case 64: 4121 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 4122 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 4123 Opc = AArch64::LD1Fourv2d; 4124 Offset = false; 4125 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { 4126 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 4127 Opc = AArch64::LDR_ZZZZXI; 4128 StackID = TargetStackID::ScalableVector; 4129 } 4130 break; 4131 } 4132 4133 assert(Opc && "Unknown register class"); 4134 MFI.setStackID(FI, StackID); 4135 4136 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 4137 .addReg(DestReg, getDefRegState(true)) 4138 .addFrameIndex(FI); 4139 if (Offset) 4140 MI.addImm(0); 4141 MI.addMemOperand(MMO); 4142 } 4143 4144 bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, 4145 const MachineInstr &UseMI, 4146 const TargetRegisterInfo *TRI) { 4147 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()), 4148 UseMI.getIterator()), 4149 [TRI](const MachineInstr &I) { 4150 return I.modifiesRegister(AArch64::NZCV, TRI) || 4151 I.readsRegister(AArch64::NZCV, TRI); 4152 }); 4153 } 4154 4155 void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets( 4156 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) { 4157 // The smallest scalable element supported by scaled SVE addressing 4158 // modes are predicates, which are 2 scalable bytes in size. 
So the scalable 4159 // byte offset must always be a multiple of 2. 4160 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset"); 4161 4162 // VGSized offsets are divided by '2', because the VG register is the 4163 // number of 64bit granules as opposed to 128bit vector chunks, 4164 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled. 4165 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes. 4166 // VG = n * 2 and the dwarf offset must be VG * 8 bytes. 4167 ByteSized = Offset.getFixed(); 4168 VGSized = Offset.getScalable() / 2; 4169 } 4170 4171 /// Returns the offset in parts to which this frame offset can be 4172 /// decomposed for the purpose of describing a frame offset. 4173 /// For non-scalable offsets this is simply its byte size. 4174 void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( 4175 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors, 4176 int64_t &NumDataVectors) { 4177 // The smallest scalable element supported by scaled SVE addressing 4178 // modes are predicates, which are 2 scalable bytes in size. So the scalable 4179 // byte offset must always be a multiple of 2. 4180 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset"); 4181 4182 NumBytes = Offset.getFixed(); 4183 NumDataVectors = 0; 4184 NumPredicateVectors = Offset.getScalable() / 2; 4185 // This method is used to get the offsets to adjust the frame offset. 4186 // If the function requires ADDPL to be used and needs more than two ADDPL 4187 // instructions, part of the offset is folded into NumDataVectors so that it 4188 // uses ADDVL for part of it, reducing the number of ADDPL instructions. 4189 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 || 4190 NumPredicateVectors > 62) { 4191 NumDataVectors = NumPredicateVectors / 8; 4192 NumPredicateVectors -= NumDataVectors * 8; 4193 } 4194 } 4195 4196 // Convenience function to create a DWARF expression for 4197 // Expr + NumBytes + NumVGScaledBytes * AArch64::VG 4198 static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes, 4199 int NumVGScaledBytes, unsigned VG, 4200 llvm::raw_string_ostream &Comment) { 4201 uint8_t buffer[16]; 4202 4203 if (NumBytes) { 4204 Expr.push_back(dwarf::DW_OP_consts); 4205 Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer)); 4206 Expr.push_back((uint8_t)dwarf::DW_OP_plus); 4207 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes); 4208 } 4209 4210 if (NumVGScaledBytes) { 4211 Expr.push_back((uint8_t)dwarf::DW_OP_consts); 4212 Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer)); 4213 4214 Expr.push_back((uint8_t)dwarf::DW_OP_bregx); 4215 Expr.append(buffer, buffer + encodeULEB128(VG, buffer)); 4216 Expr.push_back(0); 4217 4218 Expr.push_back((uint8_t)dwarf::DW_OP_mul); 4219 Expr.push_back((uint8_t)dwarf::DW_OP_plus); 4220 4221 Comment << (NumVGScaledBytes < 0 ?
" - " : " + ") 4222 << std::abs(NumVGScaledBytes) << " * VG"; 4223 } 4224 } 4225 4226 // Creates an MCCFIInstruction: 4227 // { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr } 4228 static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, 4229 unsigned Reg, 4230 const StackOffset &Offset) { 4231 int64_t NumBytes, NumVGScaledBytes; 4232 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes, 4233 NumVGScaledBytes); 4234 std::string CommentBuffer; 4235 llvm::raw_string_ostream Comment(CommentBuffer); 4236 4237 if (Reg == AArch64::SP) 4238 Comment << "sp"; 4239 else if (Reg == AArch64::FP) 4240 Comment << "fp"; 4241 else 4242 Comment << printReg(Reg, &TRI); 4243 4244 // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG) 4245 SmallString<64> Expr; 4246 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); 4247 Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg)); 4248 Expr.push_back(0); 4249 appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes, 4250 TRI.getDwarfRegNum(AArch64::VG, true), Comment); 4251 4252 // Wrap this into DW_CFA_def_cfa. 4253 SmallString<64> DefCfaExpr; 4254 DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression); 4255 uint8_t buffer[16]; 4256 DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer)); 4257 DefCfaExpr.append(Expr.str()); 4258 return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(), 4259 Comment.str()); 4260 } 4261 4262 MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI, 4263 unsigned FrameReg, unsigned Reg, 4264 const StackOffset &Offset, 4265 bool LastAdjustmentWasScalable) { 4266 if (Offset.getScalable()) 4267 return createDefCFAExpression(TRI, Reg, Offset); 4268 4269 if (FrameReg == Reg && !LastAdjustmentWasScalable) 4270 return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed())); 4271 4272 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); 4273 return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed()); 4274 } 4275 4276 MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI, 4277 unsigned Reg, 4278 const StackOffset &OffsetFromDefCFA) { 4279 int64_t NumBytes, NumVGScaledBytes; 4280 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets( 4281 OffsetFromDefCFA, NumBytes, NumVGScaledBytes); 4282 4283 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); 4284 4285 // Non-scalable offsets can use DW_CFA_offset directly. 4286 if (!NumVGScaledBytes) 4287 return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes); 4288 4289 std::string CommentBuffer; 4290 llvm::raw_string_ostream Comment(CommentBuffer); 4291 Comment << printReg(Reg, &TRI) << " @ cfa"; 4292 4293 // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG) 4294 SmallString<64> OffsetExpr; 4295 appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes, 4296 TRI.getDwarfRegNum(AArch64::VG, true), Comment); 4297 4298 // Wrap this into DW_CFA_expression 4299 SmallString<64> CfaExpr; 4300 CfaExpr.push_back(dwarf::DW_CFA_expression); 4301 uint8_t buffer[16]; 4302 CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer)); 4303 CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer)); 4304 CfaExpr.append(OffsetExpr.str()); 4305 4306 return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), SMLoc(), 4307 Comment.str()); 4308 } 4309 4310 // Helper function to emit a frame offset adjustment from a given 4311 // pointer (SrcReg), stored into DestReg. 
This function is explicit 4312 // in that it requires the opcode. 4313 static void emitFrameOffsetAdj(MachineBasicBlock &MBB, 4314 MachineBasicBlock::iterator MBBI, 4315 const DebugLoc &DL, unsigned DestReg, 4316 unsigned SrcReg, int64_t Offset, unsigned Opc, 4317 const TargetInstrInfo *TII, 4318 MachineInstr::MIFlag Flag, bool NeedsWinCFI, 4319 bool *HasWinCFI, bool EmitCFAOffset, 4320 StackOffset CFAOffset, unsigned FrameReg) { 4321 int Sign = 1; 4322 unsigned MaxEncoding, ShiftSize; 4323 switch (Opc) { 4324 case AArch64::ADDXri: 4325 case AArch64::ADDSXri: 4326 case AArch64::SUBXri: 4327 case AArch64::SUBSXri: 4328 MaxEncoding = 0xfff; 4329 ShiftSize = 12; 4330 break; 4331 case AArch64::ADDVL_XXI: 4332 case AArch64::ADDPL_XXI: 4333 case AArch64::ADDSVL_XXI: 4334 case AArch64::ADDSPL_XXI: 4335 MaxEncoding = 31; 4336 ShiftSize = 0; 4337 if (Offset < 0) { 4338 MaxEncoding = 32; 4339 Sign = -1; 4340 Offset = -Offset; 4341 } 4342 break; 4343 default: 4344 llvm_unreachable("Unsupported opcode"); 4345 } 4346 4347 // `Offset` can be in bytes or in "scalable bytes". 4348 int VScale = 1; 4349 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI) 4350 VScale = 16; 4351 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI) 4352 VScale = 2; 4353 4354 // FIXME: If the offset won't fit in 24-bits, compute the offset into a 4355 // scratch register. If DestReg is a virtual register, use it as the 4356 // scratch register; otherwise, create a new virtual register (to be 4357 // replaced by the scavenger at the end of PEI). That case can be optimized 4358 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch 4359 // register can be loaded with offset%8 and the add/sub can use an extending 4360 // instruction with LSL#3. 4361 // Currently the function handles any offsets but generates a poor sequence 4362 // of code. 4363 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); 4364 4365 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; 4366 Register TmpReg = DestReg; 4367 if (TmpReg == AArch64::XZR) 4368 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister( 4369 &AArch64::GPR64RegClass); 4370 do { 4371 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue); 4372 unsigned LocalShiftSize = 0; 4373 if (ThisVal > MaxEncoding) { 4374 ThisVal = ThisVal >> ShiftSize; 4375 LocalShiftSize = ShiftSize; 4376 } 4377 assert((ThisVal >> ShiftSize) <= MaxEncoding && 4378 "Encoding cannot handle value that big"); 4379 4380 Offset -= ThisVal << LocalShiftSize; 4381 if (Offset == 0) 4382 TmpReg = DestReg; 4383 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg) 4384 .addReg(SrcReg) 4385 .addImm(Sign * (int)ThisVal); 4386 if (ShiftSize) 4387 MBI = MBI.addImm( 4388 AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize)); 4389 MBI = MBI.setMIFlag(Flag); 4390 4391 auto Change = 4392 VScale == 1 4393 ? 
StackOffset::getFixed(ThisVal << LocalShiftSize) 4394 : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize)); 4395 if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri) 4396 CFAOffset += Change; 4397 else 4398 CFAOffset -= Change; 4399 if (EmitCFAOffset && DestReg == TmpReg) { 4400 MachineFunction &MF = *MBB.getParent(); 4401 const TargetSubtargetInfo &STI = MF.getSubtarget(); 4402 const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); 4403 4404 unsigned CFIIndex = MF.addFrameInst( 4405 createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1)); 4406 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) 4407 .addCFIIndex(CFIIndex) 4408 .setMIFlags(Flag); 4409 } 4410 4411 if (NeedsWinCFI) { 4412 assert(Sign == 1 && "SEH directives should always have a positive sign"); 4413 int Imm = (int)(ThisVal << LocalShiftSize); 4414 if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) || 4415 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) { 4416 if (HasWinCFI) 4417 *HasWinCFI = true; 4418 if (Imm == 0) 4419 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag); 4420 else 4421 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)) 4422 .addImm(Imm) 4423 .setMIFlag(Flag); 4424 assert(Offset == 0 && "Expected remaining offset to be zero to " 4425 "emit a single SEH directive"); 4426 } else if (DestReg == AArch64::SP) { 4427 if (HasWinCFI) 4428 *HasWinCFI = true; 4429 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc"); 4430 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) 4431 .addImm(Imm) 4432 .setMIFlag(Flag); 4433 } 4434 } 4435 4436 SrcReg = TmpReg; 4437 } while (Offset); 4438 } 4439 4440 void llvm::emitFrameOffset(MachineBasicBlock &MBB, 4441 MachineBasicBlock::iterator MBBI, const DebugLoc &DL, 4442 unsigned DestReg, unsigned SrcReg, 4443 StackOffset Offset, const TargetInstrInfo *TII, 4444 MachineInstr::MIFlag Flag, bool SetNZCV, 4445 bool NeedsWinCFI, bool *HasWinCFI, 4446 bool EmitCFAOffset, StackOffset CFAOffset, 4447 unsigned FrameReg) { 4448 // If a function is marked as arm_locally_streaming, then the runtime value of 4449 // vscale in the prologue/epilogue is different from the runtime value of vscale 4450 // in the function's body. To avoid having to consider multiple vscales, 4451 // we can use `addsvl` to allocate any scalable stack-slots, which under 4452 // most circumstances will be only locals, not callee-save slots. 4453 const Function &F = MBB.getParent()->getFunction(); 4454 bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body"); 4455 4456 int64_t Bytes, NumPredicateVectors, NumDataVectors; 4457 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( 4458 Offset, Bytes, NumPredicateVectors, NumDataVectors); 4459 4460 // First emit non-scalable frame offsets, or a simple 'mov'. 4461 if (Bytes || (!Offset && SrcReg != DestReg)) { 4462 assert((DestReg != AArch64::SP || Bytes % 8 == 0) && 4463 "SP increment/decrement not 8-byte aligned"); 4464 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri; 4465 if (Bytes < 0) { 4466 Bytes = -Bytes; 4467 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri; 4468 } 4469 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag, 4470 NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset, 4471 FrameReg); 4472 CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri) 4473 ?
StackOffset::getFixed(-Bytes) 4474 : StackOffset::getFixed(Bytes); 4475 SrcReg = DestReg; 4476 FrameReg = DestReg; 4477 } 4478 4479 assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) && 4480 "SetNZCV not supported with SVE vectors"); 4481 assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) && 4482 "WinCFI not supported with SVE vectors"); 4483 4484 if (NumDataVectors) { 4485 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors, 4486 UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI, 4487 TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset, 4488 CFAOffset, FrameReg); 4489 CFAOffset += StackOffset::getScalable(-NumDataVectors * 16); 4490 SrcReg = DestReg; 4491 } 4492 4493 if (NumPredicateVectors) { 4494 assert(DestReg != AArch64::SP && "Unaligned access to SP"); 4495 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors, 4496 UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI, 4497 TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset, 4498 CFAOffset, FrameReg); 4499 } 4500 } 4501 4502 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( 4503 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, 4504 MachineBasicBlock::iterator InsertPt, int FrameIndex, 4505 LiveIntervals *LIS, VirtRegMap *VRM) const { 4506 // This is a bit of a hack. Consider this instruction: 4507 // 4508 // %0 = COPY %sp; GPR64all:%0 4509 // 4510 // We explicitly chose GPR64all for the virtual register so such a copy might 4511 // be eliminated by RegisterCoalescer. However, that may not be possible, and 4512 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all 4513 // register class, TargetInstrInfo::foldMemoryOperand() is going to try. 4514 // 4515 // To prevent that, we are going to constrain the %0 register class here. 4516 // 4517 // <rdar://problem/11522048> 4518 // 4519 if (MI.isFullCopy()) { 4520 Register DstReg = MI.getOperand(0).getReg(); 4521 Register SrcReg = MI.getOperand(1).getReg(); 4522 if (SrcReg == AArch64::SP && DstReg.isVirtual()) { 4523 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass); 4524 return nullptr; 4525 } 4526 if (DstReg == AArch64::SP && SrcReg.isVirtual()) { 4527 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 4528 return nullptr; 4529 } 4530 // Nothing can be folded with a copy from/to NZCV. 4531 if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV) 4532 return nullptr; 4533 } 4534 4535 // Handle the case where a copy is being spilled or filled but the source 4536 // and destination register classes don't match. For example: 4537 // 4538 // %0 = COPY %xzr; GPR64common:%0 4539 // 4540 // In this case we can still safely fold away the COPY and generate the 4541 // following spill code: 4542 // 4543 // STRXui %xzr, %stack.0 4544 // 4545 // This also eliminates spilled cross register class COPYs (e.g. between x and 4546 // d regs) of the same size. For example: 4547 // 4548 // %0 = COPY %1; GPR64:%0, FPR64:%1 4549 // 4550 // will be filled as 4551 // 4552 // LDRDui %0, fi<#0> 4553 // 4554 // instead of 4555 // 4556 // LDRXui %Temp, fi<#0> 4557 // %0 = FMOV %Temp 4558 // 4559 if (MI.isCopy() && Ops.size() == 1 && 4560 // Make sure we're only folding the explicit COPY defs/uses.
4561 (Ops[0] == 0 || Ops[0] == 1)) { 4562 bool IsSpill = Ops[0] == 0; 4563 bool IsFill = !IsSpill; 4564 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); 4565 const MachineRegisterInfo &MRI = MF.getRegInfo(); 4566 MachineBasicBlock &MBB = *MI.getParent(); 4567 const MachineOperand &DstMO = MI.getOperand(0); 4568 const MachineOperand &SrcMO = MI.getOperand(1); 4569 Register DstReg = DstMO.getReg(); 4570 Register SrcReg = SrcMO.getReg(); 4571 // This is slightly expensive to compute for physical regs since 4572 // getMinimalPhysRegClass is slow. 4573 auto getRegClass = [&](unsigned Reg) { 4574 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg) 4575 : TRI.getMinimalPhysRegClass(Reg); 4576 }; 4577 4578 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { 4579 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) == 4580 TRI.getRegSizeInBits(*getRegClass(SrcReg)) && 4581 "Mismatched register size in non subreg COPY"); 4582 if (IsSpill) 4583 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, 4584 getRegClass(SrcReg), &TRI, Register()); 4585 else 4586 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, 4587 getRegClass(DstReg), &TRI, Register()); 4588 return &*--InsertPt; 4589 } 4590 4591 // Handle cases like spilling def of: 4592 // 4593 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0 4594 // 4595 // where the physical register source can be widened and stored to the full 4596 // virtual reg destination stack slot, in this case producing: 4597 // 4598 // STRXui %xzr, %stack.0 4599 // 4600 if (IsSpill && DstMO.isUndef() && SrcReg.isPhysical()) { 4601 assert(SrcMO.getSubReg() == 0 && 4602 "Unexpected subreg on physical register"); 4603 const TargetRegisterClass *SpillRC; 4604 unsigned SpillSubreg; 4605 switch (DstMO.getSubReg()) { 4606 default: 4607 SpillRC = nullptr; 4608 break; 4609 case AArch64::sub_32: 4610 case AArch64::ssub: 4611 if (AArch64::GPR32RegClass.contains(SrcReg)) { 4612 SpillRC = &AArch64::GPR64RegClass; 4613 SpillSubreg = AArch64::sub_32; 4614 } else if (AArch64::FPR32RegClass.contains(SrcReg)) { 4615 SpillRC = &AArch64::FPR64RegClass; 4616 SpillSubreg = AArch64::ssub; 4617 } else 4618 SpillRC = nullptr; 4619 break; 4620 case AArch64::dsub: 4621 if (AArch64::FPR64RegClass.contains(SrcReg)) { 4622 SpillRC = &AArch64::FPR128RegClass; 4623 SpillSubreg = AArch64::dsub; 4624 } else 4625 SpillRC = nullptr; 4626 break; 4627 } 4628 4629 if (SpillRC) 4630 if (unsigned WidenedSrcReg = 4631 TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) { 4632 storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(), 4633 FrameIndex, SpillRC, &TRI, Register()); 4634 return &*--InsertPt; 4635 } 4636 } 4637 4638 // Handle cases like filling use of: 4639 // 4640 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1 4641 // 4642 // where we can load the full virtual reg source stack slot, into the subreg 4643 // destination, in this case producing: 4644 // 4645 // LDRWui %0:sub_32<def,read-undef>, %stack.0 4646 // 4647 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) { 4648 const TargetRegisterClass *FillRC; 4649 switch (DstMO.getSubReg()) { 4650 default: 4651 FillRC = nullptr; 4652 break; 4653 case AArch64::sub_32: 4654 FillRC = &AArch64::GPR32RegClass; 4655 break; 4656 case AArch64::ssub: 4657 FillRC = &AArch64::FPR32RegClass; 4658 break; 4659 case AArch64::dsub: 4660 FillRC = &AArch64::FPR64RegClass; 4661 break; 4662 } 4663 4664 if (FillRC) { 4665 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) == 4666 
TRI.getRegSizeInBits(*FillRC) && 4667 "Mismatched regclass size on folded subreg COPY"); 4668 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI, 4669 Register()); 4670 MachineInstr &LoadMI = *--InsertPt; 4671 MachineOperand &LoadDst = LoadMI.getOperand(0); 4672 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load"); 4673 LoadDst.setSubReg(DstMO.getSubReg()); 4674 LoadDst.setIsUndef(); 4675 return &LoadMI; 4676 } 4677 } 4678 } 4679 4680 // Cannot fold. 4681 return nullptr; 4682 } 4683 4684 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, 4685 StackOffset &SOffset, 4686 bool *OutUseUnscaledOp, 4687 unsigned *OutUnscaledOp, 4688 int64_t *EmittableOffset) { 4689 // Set output values in case of early exit. 4690 if (EmittableOffset) 4691 *EmittableOffset = 0; 4692 if (OutUseUnscaledOp) 4693 *OutUseUnscaledOp = false; 4694 if (OutUnscaledOp) 4695 *OutUnscaledOp = 0; 4696 4697 // Exit early for structured vector spills/fills as they can't take an 4698 // immediate offset. 4699 switch (MI.getOpcode()) { 4700 default: 4701 break; 4702 case AArch64::LD1Twov2d: 4703 case AArch64::LD1Threev2d: 4704 case AArch64::LD1Fourv2d: 4705 case AArch64::LD1Twov1d: 4706 case AArch64::LD1Threev1d: 4707 case AArch64::LD1Fourv1d: 4708 case AArch64::ST1Twov2d: 4709 case AArch64::ST1Threev2d: 4710 case AArch64::ST1Fourv2d: 4711 case AArch64::ST1Twov1d: 4712 case AArch64::ST1Threev1d: 4713 case AArch64::ST1Fourv1d: 4714 case AArch64::ST1i8: 4715 case AArch64::ST1i16: 4716 case AArch64::ST1i32: 4717 case AArch64::ST1i64: 4718 case AArch64::IRG: 4719 case AArch64::IRGstack: 4720 case AArch64::STGloop: 4721 case AArch64::STZGloop: 4722 return AArch64FrameOffsetCannotUpdate; 4723 } 4724 4725 // Get the min/max offset and the scale. 4726 TypeSize ScaleValue(0U, false); 4727 unsigned Width; 4728 int64_t MinOff, MaxOff; 4729 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff, 4730 MaxOff)) 4731 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 4732 4733 // Construct the complete offset. 4734 bool IsMulVL = ScaleValue.isScalable(); 4735 unsigned Scale = ScaleValue.getKnownMinValue(); 4736 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed(); 4737 4738 const MachineOperand &ImmOpnd = 4739 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode())); 4740 Offset += ImmOpnd.getImm() * Scale; 4741 4742 // If the offset doesn't match the scale, we rewrite the instruction to 4743 // use the unscaled instruction instead. Likewise, if we have a negative 4744 // offset and there is an unscaled op to use. 4745 std::optional<unsigned> UnscaledOp = 4746 AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode()); 4747 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0); 4748 if (useUnscaledOp && 4749 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff, 4750 MaxOff)) 4751 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 4752 4753 Scale = ScaleValue.getKnownMinValue(); 4754 assert(IsMulVL == ScaleValue.isScalable() && 4755 "Unscaled opcode has different value for scalable"); 4756 4757 int64_t Remainder = Offset % Scale; 4758 assert(!(Remainder && useUnscaledOp) && 4759 "Cannot have remainder when using unscaled op"); 4760 4761 assert(MinOff < MaxOff && "Unexpected Min/Max offsets"); 4762 int64_t NewOffset = Offset / Scale; 4763 if (MinOff <= NewOffset && NewOffset <= MaxOff) 4764 Offset = Remainder; 4765 else { 4766 NewOffset = NewOffset < 0 ? 
MinOff : MaxOff; 4767 Offset = Offset - NewOffset * Scale; 4768 } 4769 4770 if (EmittableOffset) 4771 *EmittableOffset = NewOffset; 4772 if (OutUseUnscaledOp) 4773 *OutUseUnscaledOp = useUnscaledOp; 4774 if (OutUnscaledOp && UnscaledOp) 4775 *OutUnscaledOp = *UnscaledOp; 4776 4777 if (IsMulVL) 4778 SOffset = StackOffset::get(SOffset.getFixed(), Offset); 4779 else 4780 SOffset = StackOffset::get(Offset, SOffset.getScalable()); 4781 return AArch64FrameOffsetCanUpdate | 4782 (SOffset ? 0 : AArch64FrameOffsetIsLegal); 4783 } 4784 4785 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, 4786 unsigned FrameReg, StackOffset &Offset, 4787 const AArch64InstrInfo *TII) { 4788 unsigned Opcode = MI.getOpcode(); 4789 unsigned ImmIdx = FrameRegIdx + 1; 4790 4791 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) { 4792 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm()); 4793 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), 4794 MI.getOperand(0).getReg(), FrameReg, Offset, TII, 4795 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri)); 4796 MI.eraseFromParent(); 4797 Offset = StackOffset(); 4798 return true; 4799 } 4800 4801 int64_t NewOffset; 4802 unsigned UnscaledOp; 4803 bool UseUnscaledOp; 4804 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, 4805 &UnscaledOp, &NewOffset); 4806 if (Status & AArch64FrameOffsetCanUpdate) { 4807 if (Status & AArch64FrameOffsetIsLegal) 4808 // Replace the FrameIndex with FrameReg. 4809 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); 4810 if (UseUnscaledOp) 4811 MI.setDesc(TII->get(UnscaledOp)); 4812 4813 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); 4814 return !Offset; 4815 } 4816 4817 return false; 4818 } 4819 4820 MCInst AArch64InstrInfo::getNop() const { 4821 return MCInstBuilder(AArch64::HINT).addImm(0); 4822 } 4823 4824 // AArch64 supports MachineCombiner. 4825 bool AArch64InstrInfo::useMachineCombiner() const { return true; } 4826 4827 // True when Opc sets flag 4828 static bool isCombineInstrSettingFlag(unsigned Opc) { 4829 switch (Opc) { 4830 case AArch64::ADDSWrr: 4831 case AArch64::ADDSWri: 4832 case AArch64::ADDSXrr: 4833 case AArch64::ADDSXri: 4834 case AArch64::SUBSWrr: 4835 case AArch64::SUBSXrr: 4836 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 4837 case AArch64::SUBSWri: 4838 case AArch64::SUBSXri: 4839 return true; 4840 default: 4841 break; 4842 } 4843 return false; 4844 } 4845 4846 // 32b Opcodes that can be combined with a MUL 4847 static bool isCombineInstrCandidate32(unsigned Opc) { 4848 switch (Opc) { 4849 case AArch64::ADDWrr: 4850 case AArch64::ADDWri: 4851 case AArch64::SUBWrr: 4852 case AArch64::ADDSWrr: 4853 case AArch64::ADDSWri: 4854 case AArch64::SUBSWrr: 4855 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 4856 case AArch64::SUBWri: 4857 case AArch64::SUBSWri: 4858 return true; 4859 default: 4860 break; 4861 } 4862 return false; 4863 } 4864 4865 // 64b Opcodes that can be combined with a MUL 4866 static bool isCombineInstrCandidate64(unsigned Opc) { 4867 switch (Opc) { 4868 case AArch64::ADDXrr: 4869 case AArch64::ADDXri: 4870 case AArch64::SUBXrr: 4871 case AArch64::ADDSXrr: 4872 case AArch64::ADDSXri: 4873 case AArch64::SUBSXrr: 4874 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 
4875 case AArch64::SUBXri: 4876 case AArch64::SUBSXri: 4877 case AArch64::ADDv8i8: 4878 case AArch64::ADDv16i8: 4879 case AArch64::ADDv4i16: 4880 case AArch64::ADDv8i16: 4881 case AArch64::ADDv2i32: 4882 case AArch64::ADDv4i32: 4883 case AArch64::SUBv8i8: 4884 case AArch64::SUBv16i8: 4885 case AArch64::SUBv4i16: 4886 case AArch64::SUBv8i16: 4887 case AArch64::SUBv2i32: 4888 case AArch64::SUBv4i32: 4889 return true; 4890 default: 4891 break; 4892 } 4893 return false; 4894 } 4895 4896 // FP Opcodes that can be combined with a FMUL. 4897 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { 4898 switch (Inst.getOpcode()) { 4899 default: 4900 break; 4901 case AArch64::FADDHrr: 4902 case AArch64::FADDSrr: 4903 case AArch64::FADDDrr: 4904 case AArch64::FADDv4f16: 4905 case AArch64::FADDv8f16: 4906 case AArch64::FADDv2f32: 4907 case AArch64::FADDv2f64: 4908 case AArch64::FADDv4f32: 4909 case AArch64::FSUBHrr: 4910 case AArch64::FSUBSrr: 4911 case AArch64::FSUBDrr: 4912 case AArch64::FSUBv4f16: 4913 case AArch64::FSUBv8f16: 4914 case AArch64::FSUBv2f32: 4915 case AArch64::FSUBv2f64: 4916 case AArch64::FSUBv4f32: 4917 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options; 4918 // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by 4919 // the target options or if FADD/FSUB has the contract fast-math flag. 4920 return Options.UnsafeFPMath || 4921 Options.AllowFPOpFusion == FPOpFusion::Fast || 4922 Inst.getFlag(MachineInstr::FmContract); 4924 } 4925 return false; 4926 } 4927 4928 // Opcodes that can be combined with a MUL 4929 static bool isCombineInstrCandidate(unsigned Opc) { 4930 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc)); 4931 } 4932 4933 // 4934 // Utility routine that checks if \param MO is defined by an 4935 // \param CombineOpc instruction in the basic block \param MBB 4936 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, 4937 unsigned CombineOpc, unsigned ZeroReg = 0, 4938 bool CheckZeroReg = false) { 4939 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 4940 MachineInstr *MI = nullptr; 4941 4942 if (MO.isReg() && MO.getReg().isVirtual()) 4943 MI = MRI.getUniqueVRegDef(MO.getReg()); 4944 // And it needs to be in the trace (otherwise, it won't have a depth). 4945 if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc) 4946 return false; 4947 // Must only be used by the user we combine with. 4948 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) 4949 return false; 4950 4951 if (CheckZeroReg) { 4952 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && 4953 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && 4954 MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs"); 4955 // The third input reg must be zero. 4956 if (MI->getOperand(3).getReg() != ZeroReg) 4957 return false; 4958 } 4959 4960 if (isCombineInstrSettingFlag(CombineOpc) && 4961 MI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 4962 return false; 4963 4964 return true; 4965 } 4966 4967 // 4968 // Is \param MO defined by an integer multiply and can be combined? 4969 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, 4970 unsigned MulOpc, unsigned ZeroReg) { 4971 return canCombine(MBB, MO, MulOpc, ZeroReg, true); 4972 } 4973 4974 // 4975 // Is \param MO defined by a floating-point multiply and can be combined?
4976 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, 4977 unsigned MulOpc) { 4978 return canCombine(MBB, MO, MulOpc); 4979 } 4980 4981 // TODO: There are many more machine instruction opcodes to match: 4982 // 1. Other data types (integer, vectors) 4983 // 2. Other math / logic operations (xor, or) 4984 // 3. Other forms of the same operation (intrinsics and other variants) 4985 bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst, 4986 bool Invert) const { 4987 if (Invert) 4988 return false; 4989 switch (Inst.getOpcode()) { 4990 // == Floating-point types == 4991 // -- Floating-point instructions -- 4992 case AArch64::FADDHrr: 4993 case AArch64::FADDSrr: 4994 case AArch64::FADDDrr: 4995 case AArch64::FMULHrr: 4996 case AArch64::FMULSrr: 4997 case AArch64::FMULDrr: 4998 case AArch64::FMULX16: 4999 case AArch64::FMULX32: 5000 case AArch64::FMULX64: 5001 // -- Advanced SIMD instructions -- 5002 case AArch64::FADDv4f16: 5003 case AArch64::FADDv8f16: 5004 case AArch64::FADDv2f32: 5005 case AArch64::FADDv4f32: 5006 case AArch64::FADDv2f64: 5007 case AArch64::FMULv4f16: 5008 case AArch64::FMULv8f16: 5009 case AArch64::FMULv2f32: 5010 case AArch64::FMULv4f32: 5011 case AArch64::FMULv2f64: 5012 case AArch64::FMULXv4f16: 5013 case AArch64::FMULXv8f16: 5014 case AArch64::FMULXv2f32: 5015 case AArch64::FMULXv4f32: 5016 case AArch64::FMULXv2f64: 5017 // -- SVE instructions -- 5018 // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX 5019 // in the SVE instruction set (though there are predicated ones). 5020 case AArch64::FADD_ZZZ_H: 5021 case AArch64::FADD_ZZZ_S: 5022 case AArch64::FADD_ZZZ_D: 5023 case AArch64::FMUL_ZZZ_H: 5024 case AArch64::FMUL_ZZZ_S: 5025 case AArch64::FMUL_ZZZ_D: 5026 return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath || 5027 (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && 5028 Inst.getFlag(MachineInstr::MIFlag::FmNsz)); 5029 5030 // == Integer types == 5031 // -- Base instructions -- 5032 // Opcodes MULWrr and MULXrr don't exist because 5033 // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of 5034 // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively. 5035 // The machine-combiner does not support three-source-operand machine 5036 // instructions, so we cannot reassociate MULs. 5037 case AArch64::ADDWrr: 5038 case AArch64::ADDXrr: 5039 case AArch64::ANDWrr: 5040 case AArch64::ANDXrr: 5041 case AArch64::ORRWrr: 5042 case AArch64::ORRXrr: 5043 case AArch64::EORWrr: 5044 case AArch64::EORXrr: 5045 case AArch64::EONWrr: 5046 case AArch64::EONXrr: 5047 // -- Advanced SIMD instructions -- 5048 // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL 5049 // in the Advanced SIMD instruction set.
5050 case AArch64::ADDv8i8: 5051 case AArch64::ADDv16i8: 5052 case AArch64::ADDv4i16: 5053 case AArch64::ADDv8i16: 5054 case AArch64::ADDv2i32: 5055 case AArch64::ADDv4i32: 5056 case AArch64::ADDv1i64: 5057 case AArch64::ADDv2i64: 5058 case AArch64::MULv8i8: 5059 case AArch64::MULv16i8: 5060 case AArch64::MULv4i16: 5061 case AArch64::MULv8i16: 5062 case AArch64::MULv2i32: 5063 case AArch64::MULv4i32: 5064 case AArch64::ANDv8i8: 5065 case AArch64::ANDv16i8: 5066 case AArch64::ORRv8i8: 5067 case AArch64::ORRv16i8: 5068 case AArch64::EORv8i8: 5069 case AArch64::EORv16i8: 5070 // -- SVE instructions -- 5071 case AArch64::ADD_ZZZ_B: 5072 case AArch64::ADD_ZZZ_H: 5073 case AArch64::ADD_ZZZ_S: 5074 case AArch64::ADD_ZZZ_D: 5075 case AArch64::MUL_ZZZ_B: 5076 case AArch64::MUL_ZZZ_H: 5077 case AArch64::MUL_ZZZ_S: 5078 case AArch64::MUL_ZZZ_D: 5079 case AArch64::AND_ZZZ: 5080 case AArch64::ORR_ZZZ: 5081 case AArch64::EOR_ZZZ: 5082 return true; 5083 5084 default: 5085 return false; 5086 } 5087 } 5088 5089 /// Find instructions that can be turned into madd. 5090 static bool getMaddPatterns(MachineInstr &Root, 5091 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 5092 unsigned Opc = Root.getOpcode(); 5093 MachineBasicBlock &MBB = *Root.getParent(); 5094 bool Found = false; 5095 5096 if (!isCombineInstrCandidate(Opc)) 5097 return false; 5098 if (isCombineInstrSettingFlag(Opc)) { 5099 int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true); 5100 // When NZCV is live bail out. 5101 if (Cmp_NZCV == -1) 5102 return false; 5103 unsigned NewOpc = convertToNonFlagSettingOpc(Root); 5104 // When opcode can't change bail out. 5105 // CHECKME: do we miss any cases for opcode conversion? 5106 if (NewOpc == Opc) 5107 return false; 5108 Opc = NewOpc; 5109 } 5110 5111 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg, 5112 MachineCombinerPattern Pattern) { 5113 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) { 5114 Patterns.push_back(Pattern); 5115 Found = true; 5116 } 5117 }; 5118 5119 auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) { 5120 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) { 5121 Patterns.push_back(Pattern); 5122 Found = true; 5123 } 5124 }; 5125 5126 typedef MachineCombinerPattern MCP; 5127 5128 switch (Opc) { 5129 default: 5130 break; 5131 case AArch64::ADDWrr: 5132 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 5133 "ADDWrr does not have register operands"); 5134 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1); 5135 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2); 5136 break; 5137 case AArch64::ADDXrr: 5138 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1); 5139 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2); 5140 break; 5141 case AArch64::SUBWrr: 5142 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1); 5143 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2); 5144 break; 5145 case AArch64::SUBXrr: 5146 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1); 5147 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2); 5148 break; 5149 case AArch64::ADDWri: 5150 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1); 5151 break; 5152 case AArch64::ADDXri: 5153 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1); 5154 break; 5155 case AArch64::SUBWri: 5156 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1); 5157 break; 5158 case AArch64::SUBXri: 5159 
setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1); 5160 break; 5161 case AArch64::ADDv8i8: 5162 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1); 5163 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2); 5164 break; 5165 case AArch64::ADDv16i8: 5166 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1); 5167 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2); 5168 break; 5169 case AArch64::ADDv4i16: 5170 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1); 5171 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2); 5172 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1); 5173 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2); 5174 break; 5175 case AArch64::ADDv8i16: 5176 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1); 5177 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2); 5178 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1); 5179 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2); 5180 break; 5181 case AArch64::ADDv2i32: 5182 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1); 5183 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2); 5184 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1); 5185 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2); 5186 break; 5187 case AArch64::ADDv4i32: 5188 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1); 5189 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2); 5190 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1); 5191 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2); 5192 break; 5193 case AArch64::SUBv8i8: 5194 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1); 5195 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2); 5196 break; 5197 case AArch64::SUBv16i8: 5198 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1); 5199 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2); 5200 break; 5201 case AArch64::SUBv4i16: 5202 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1); 5203 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2); 5204 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1); 5205 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2); 5206 break; 5207 case AArch64::SUBv8i16: 5208 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1); 5209 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2); 5210 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1); 5211 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2); 5212 break; 5213 case AArch64::SUBv2i32: 5214 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1); 5215 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2); 5216 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1); 5217 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2); 5218 break; 5219 case AArch64::SUBv4i32: 5220 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1); 5221 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2); 5222 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1); 5223 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2); 5224 break; 5225 } 5226 return Found; 5227 } 5228 /// Floating-Point Support 5229 5230 /// Find instructions that can be turned into madd. 
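/// A minimal sketch of the rewrite these patterns enable (virtual register
/// numbers are made up; the fusion checks in isCombineInstrCandidateFP still
/// apply):
///   %3 = FMULSrr %1, %2
///   %4 = FADDSrr killed %3, %0
/// ==>
///   %4 = FMADDSrrr %1, %2, %0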
5231 static bool getFMAPatterns(MachineInstr &Root, 5232 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 5233 5234 if (!isCombineInstrCandidateFP(Root)) 5235 return false; 5236 5237 MachineBasicBlock &MBB = *Root.getParent(); 5238 bool Found = false; 5239 5240 auto Match = [&](int Opcode, int Operand, 5241 MachineCombinerPattern Pattern) -> bool { 5242 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) { 5243 Patterns.push_back(Pattern); 5244 return true; 5245 } 5246 return false; 5247 }; 5248 5249 typedef MachineCombinerPattern MCP; 5250 5251 switch (Root.getOpcode()) { 5252 default: 5253 assert(false && "Unsupported FP instruction in combiner\n"); 5254 break; 5255 case AArch64::FADDHrr: 5256 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 5257 "FADDHrr does not have register operands"); 5258 5259 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1); 5260 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2); 5261 break; 5262 case AArch64::FADDSrr: 5263 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 5264 "FADDSrr does not have register operands"); 5265 5266 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) || 5267 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1); 5268 5269 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) || 5270 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2); 5271 break; 5272 case AArch64::FADDDrr: 5273 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) || 5274 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1); 5275 5276 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) || 5277 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2); 5278 break; 5279 case AArch64::FADDv4f16: 5280 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) || 5281 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1); 5282 5283 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) || 5284 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2); 5285 break; 5286 case AArch64::FADDv8f16: 5287 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) || 5288 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1); 5289 5290 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) || 5291 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2); 5292 break; 5293 case AArch64::FADDv2f32: 5294 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) || 5295 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1); 5296 5297 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) || 5298 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2); 5299 break; 5300 case AArch64::FADDv2f64: 5301 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) || 5302 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1); 5303 5304 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) || 5305 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2); 5306 break; 5307 case AArch64::FADDv4f32: 5308 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) || 5309 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1); 5310 5311 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) || 5312 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2); 5313 break; 5314 case AArch64::FSUBHrr: 5315 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1); 5316 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2); 5317 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1); 5318 break; 
5319 case AArch64::FSUBSrr: 5320 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1); 5321 5322 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) || 5323 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2); 5324 5325 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1); 5326 break; 5327 case AArch64::FSUBDrr: 5328 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1); 5329 5330 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) || 5331 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2); 5332 5333 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1); 5334 break; 5335 case AArch64::FSUBv4f16: 5336 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) || 5337 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2); 5338 5339 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) || 5340 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1); 5341 break; 5342 case AArch64::FSUBv8f16: 5343 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) || 5344 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2); 5345 5346 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) || 5347 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1); 5348 break; 5349 case AArch64::FSUBv2f32: 5350 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) || 5351 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2); 5352 5353 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) || 5354 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1); 5355 break; 5356 case AArch64::FSUBv2f64: 5357 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) || 5358 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2); 5359 5360 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) || 5361 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1); 5362 break; 5363 case AArch64::FSUBv4f32: 5364 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) || 5365 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2); 5366 5367 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) || 5368 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1); 5369 break; 5370 } 5371 return Found; 5372 } 5373 5374 static bool getFMULPatterns(MachineInstr &Root, 5375 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 5376 MachineBasicBlock &MBB = *Root.getParent(); 5377 bool Found = false; 5378 5379 auto Match = [&](unsigned Opcode, int Operand, 5380 MachineCombinerPattern Pattern) -> bool { 5381 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5382 MachineOperand &MO = Root.getOperand(Operand); 5383 MachineInstr *MI = nullptr; 5384 if (MO.isReg() && MO.getReg().isVirtual()) 5385 MI = MRI.getUniqueVRegDef(MO.getReg()); 5386 // Ignore No-op COPYs in FMUL(COPY(DUP(..))) 5387 if (MI && MI->getOpcode() == TargetOpcode::COPY && 5388 MI->getOperand(1).getReg().isVirtual()) 5389 MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg()); 5390 if (MI && MI->getOpcode() == Opcode) { 5391 Patterns.push_back(Pattern); 5392 return true; 5393 } 5394 return false; 5395 }; 5396 5397 typedef MachineCombinerPattern MCP; 5398 5399 switch (Root.getOpcode()) { 5400 default: 5401 return false; 5402 case AArch64::FMULv2f32: 5403 Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1); 5404 Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2); 5405 break; 5406 case AArch64::FMULv2f64: 5407 Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1); 5408 Found |= 
Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2); 5409 break; 5410 case AArch64::FMULv4f16: 5411 Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1); 5412 Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2); 5413 break; 5414 case AArch64::FMULv4f32: 5415 Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1); 5416 Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2); 5417 break; 5418 case AArch64::FMULv8f16: 5419 Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1); 5420 Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2); 5421 break; 5422 } 5423 5424 return Found; 5425 } 5426 5427 static bool getFNEGPatterns(MachineInstr &Root, 5428 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 5429 unsigned Opc = Root.getOpcode(); 5430 MachineBasicBlock &MBB = *Root.getParent(); 5431 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5432 5433 auto Match = [&](unsigned Opcode, MachineCombinerPattern Pattern) -> bool { 5434 MachineOperand &MO = Root.getOperand(1); 5435 MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg()); 5436 if (MI != nullptr && (MI->getOpcode() == Opcode) && 5437 MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()) && 5438 Root.getFlag(MachineInstr::MIFlag::FmContract) && 5439 Root.getFlag(MachineInstr::MIFlag::FmNsz) && 5440 MI->getFlag(MachineInstr::MIFlag::FmContract) && 5441 MI->getFlag(MachineInstr::MIFlag::FmNsz)) { 5442 Patterns.push_back(Pattern); 5443 return true; 5444 } 5445 return false; 5446 }; 5447 5448 switch (Opc) { 5449 default: 5450 break; 5451 case AArch64::FNEGDr: 5452 return Match(AArch64::FMADDDrrr, MachineCombinerPattern::FNMADD); 5453 case AArch64::FNEGSr: 5454 return Match(AArch64::FMADDSrrr, MachineCombinerPattern::FNMADD); 5455 } 5456 5457 return false; 5458 } 5459 5460 /// Return true when a code sequence can improve throughput. It 5461 /// should be called only for instructions in loops. 
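/// Patterns reported as throughput patterns are assumed to be profitable
/// because they replace a multiply and an add/sub with a single fused
/// instruction, even when this does not shorten the critical path.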
5462 /// \param Pattern - combiner pattern 5463 bool AArch64InstrInfo::isThroughputPattern( 5464 MachineCombinerPattern Pattern) const { 5465 switch (Pattern) { 5466 default: 5467 break; 5468 case MachineCombinerPattern::FMULADDH_OP1: 5469 case MachineCombinerPattern::FMULADDH_OP2: 5470 case MachineCombinerPattern::FMULSUBH_OP1: 5471 case MachineCombinerPattern::FMULSUBH_OP2: 5472 case MachineCombinerPattern::FMULADDS_OP1: 5473 case MachineCombinerPattern::FMULADDS_OP2: 5474 case MachineCombinerPattern::FMULSUBS_OP1: 5475 case MachineCombinerPattern::FMULSUBS_OP2: 5476 case MachineCombinerPattern::FMULADDD_OP1: 5477 case MachineCombinerPattern::FMULADDD_OP2: 5478 case MachineCombinerPattern::FMULSUBD_OP1: 5479 case MachineCombinerPattern::FMULSUBD_OP2: 5480 case MachineCombinerPattern::FNMULSUBH_OP1: 5481 case MachineCombinerPattern::FNMULSUBS_OP1: 5482 case MachineCombinerPattern::FNMULSUBD_OP1: 5483 case MachineCombinerPattern::FMLAv4i16_indexed_OP1: 5484 case MachineCombinerPattern::FMLAv4i16_indexed_OP2: 5485 case MachineCombinerPattern::FMLAv8i16_indexed_OP1: 5486 case MachineCombinerPattern::FMLAv8i16_indexed_OP2: 5487 case MachineCombinerPattern::FMLAv1i32_indexed_OP1: 5488 case MachineCombinerPattern::FMLAv1i32_indexed_OP2: 5489 case MachineCombinerPattern::FMLAv1i64_indexed_OP1: 5490 case MachineCombinerPattern::FMLAv1i64_indexed_OP2: 5491 case MachineCombinerPattern::FMLAv4f16_OP2: 5492 case MachineCombinerPattern::FMLAv4f16_OP1: 5493 case MachineCombinerPattern::FMLAv8f16_OP1: 5494 case MachineCombinerPattern::FMLAv8f16_OP2: 5495 case MachineCombinerPattern::FMLAv2f32_OP2: 5496 case MachineCombinerPattern::FMLAv2f32_OP1: 5497 case MachineCombinerPattern::FMLAv2f64_OP1: 5498 case MachineCombinerPattern::FMLAv2f64_OP2: 5499 case MachineCombinerPattern::FMLAv2i32_indexed_OP1: 5500 case MachineCombinerPattern::FMLAv2i32_indexed_OP2: 5501 case MachineCombinerPattern::FMLAv2i64_indexed_OP1: 5502 case MachineCombinerPattern::FMLAv2i64_indexed_OP2: 5503 case MachineCombinerPattern::FMLAv4f32_OP1: 5504 case MachineCombinerPattern::FMLAv4f32_OP2: 5505 case MachineCombinerPattern::FMLAv4i32_indexed_OP1: 5506 case MachineCombinerPattern::FMLAv4i32_indexed_OP2: 5507 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: 5508 case MachineCombinerPattern::FMLSv4i16_indexed_OP2: 5509 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: 5510 case MachineCombinerPattern::FMLSv8i16_indexed_OP2: 5511 case MachineCombinerPattern::FMLSv1i32_indexed_OP2: 5512 case MachineCombinerPattern::FMLSv1i64_indexed_OP2: 5513 case MachineCombinerPattern::FMLSv2i32_indexed_OP2: 5514 case MachineCombinerPattern::FMLSv2i64_indexed_OP2: 5515 case MachineCombinerPattern::FMLSv4f16_OP1: 5516 case MachineCombinerPattern::FMLSv4f16_OP2: 5517 case MachineCombinerPattern::FMLSv8f16_OP1: 5518 case MachineCombinerPattern::FMLSv8f16_OP2: 5519 case MachineCombinerPattern::FMLSv2f32_OP2: 5520 case MachineCombinerPattern::FMLSv2f64_OP2: 5521 case MachineCombinerPattern::FMLSv4i32_indexed_OP2: 5522 case MachineCombinerPattern::FMLSv4f32_OP2: 5523 case MachineCombinerPattern::FMULv2i32_indexed_OP1: 5524 case MachineCombinerPattern::FMULv2i32_indexed_OP2: 5525 case MachineCombinerPattern::FMULv2i64_indexed_OP1: 5526 case MachineCombinerPattern::FMULv2i64_indexed_OP2: 5527 case MachineCombinerPattern::FMULv4i16_indexed_OP1: 5528 case MachineCombinerPattern::FMULv4i16_indexed_OP2: 5529 case MachineCombinerPattern::FMULv4i32_indexed_OP1: 5530 case MachineCombinerPattern::FMULv4i32_indexed_OP2: 5531 case 
MachineCombinerPattern::FMULv8i16_indexed_OP1: 5532 case MachineCombinerPattern::FMULv8i16_indexed_OP2: 5533 case MachineCombinerPattern::MULADDv8i8_OP1: 5534 case MachineCombinerPattern::MULADDv8i8_OP2: 5535 case MachineCombinerPattern::MULADDv16i8_OP1: 5536 case MachineCombinerPattern::MULADDv16i8_OP2: 5537 case MachineCombinerPattern::MULADDv4i16_OP1: 5538 case MachineCombinerPattern::MULADDv4i16_OP2: 5539 case MachineCombinerPattern::MULADDv8i16_OP1: 5540 case MachineCombinerPattern::MULADDv8i16_OP2: 5541 case MachineCombinerPattern::MULADDv2i32_OP1: 5542 case MachineCombinerPattern::MULADDv2i32_OP2: 5543 case MachineCombinerPattern::MULADDv4i32_OP1: 5544 case MachineCombinerPattern::MULADDv4i32_OP2: 5545 case MachineCombinerPattern::MULSUBv8i8_OP1: 5546 case MachineCombinerPattern::MULSUBv8i8_OP2: 5547 case MachineCombinerPattern::MULSUBv16i8_OP1: 5548 case MachineCombinerPattern::MULSUBv16i8_OP2: 5549 case MachineCombinerPattern::MULSUBv4i16_OP1: 5550 case MachineCombinerPattern::MULSUBv4i16_OP2: 5551 case MachineCombinerPattern::MULSUBv8i16_OP1: 5552 case MachineCombinerPattern::MULSUBv8i16_OP2: 5553 case MachineCombinerPattern::MULSUBv2i32_OP1: 5554 case MachineCombinerPattern::MULSUBv2i32_OP2: 5555 case MachineCombinerPattern::MULSUBv4i32_OP1: 5556 case MachineCombinerPattern::MULSUBv4i32_OP2: 5557 case MachineCombinerPattern::MULADDv4i16_indexed_OP1: 5558 case MachineCombinerPattern::MULADDv4i16_indexed_OP2: 5559 case MachineCombinerPattern::MULADDv8i16_indexed_OP1: 5560 case MachineCombinerPattern::MULADDv8i16_indexed_OP2: 5561 case MachineCombinerPattern::MULADDv2i32_indexed_OP1: 5562 case MachineCombinerPattern::MULADDv2i32_indexed_OP2: 5563 case MachineCombinerPattern::MULADDv4i32_indexed_OP1: 5564 case MachineCombinerPattern::MULADDv4i32_indexed_OP2: 5565 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 5566 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 5567 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 5568 case MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 5569 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 5570 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 5571 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 5572 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 5573 return true; 5574 } // end switch (Pattern) 5575 return false; 5576 } 5577 5578 /// Find other MI combine patterns. 5579 static bool getMiscPatterns(MachineInstr &Root, 5580 SmallVectorImpl<MachineCombinerPattern> &Patterns) 5581 { 5582 // A - (B + C) ==> (A - B) - C or (A - C) - B 5583 unsigned Opc = Root.getOpcode(); 5584 MachineBasicBlock &MBB = *Root.getParent(); 5585 5586 switch (Opc) { 5587 case AArch64::SUBWrr: 5588 case AArch64::SUBSWrr: 5589 case AArch64::SUBXrr: 5590 case AArch64::SUBSXrr: 5591 // Found candidate root. 5592 break; 5593 default: 5594 return false; 5595 } 5596 5597 if (isCombineInstrSettingFlag(Opc) && 5598 Root.findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 5599 return false; 5600 5601 if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) || 5602 canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) || 5603 canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) || 5604 canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) { 5605 Patterns.push_back(MachineCombinerPattern::SUBADD_OP1); 5606 Patterns.push_back(MachineCombinerPattern::SUBADD_OP2); 5607 return true; 5608 } 5609 5610 return false; 5611 } 5612 5613 /// Return true when there is potentially a faster code sequence for an 5614 /// instruction chain ending in \p Root. 
All potential patterns are listed in
/// the \p Patterns vector. Patterns should be sorted in priority order since
/// the pattern evaluator stops checking as soon as it finds a faster sequence.
bool AArch64InstrInfo::getMachineCombinerPatterns(
    MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
    bool DoRegPressureReduce) const {
  // Integer patterns
  if (getMaddPatterns(Root, Patterns))
    return true;
  // Floating point patterns
  if (getFMULPatterns(Root, Patterns))
    return true;
  if (getFMAPatterns(Root, Patterns))
    return true;
  if (getFNEGPatterns(Root, Patterns))
    return true;

  // Other patterns
  if (getMiscPatterns(Root, Patterns))
    return true;

  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
                                                     DoRegPressureReduce);
}

enum class FMAInstKind { Default, Indexed, Accumulator };

/// genFusedMultiply - Generate fused multiply instructions.
/// This function supports both integer and floating point instructions.
/// A typical example:
///  F|MUL I=A,B,0
///  F|ADD R,I,C
///  ==> F|MADD R,A,B,C
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the F|ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the F|MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the f|madd instruction
/// \param RC Register class of operands
/// \param kind is the kind of FMA instruction (addressing mode) to be
/// generated
/// \param ReplacedAddend is the result register from the instruction
/// replacing the non-combined operand, if any.
static MachineInstr *
genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
                 const TargetInstrInfo *TII, MachineInstr &Root,
                 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
                 unsigned MaddOpc, const TargetRegisterClass *RC,
                 FMAInstKind kind = FMAInstKind::Default,
                 const Register *ReplacedAddend = nullptr) {
  assert(IdxMulOpd == 1 || IdxMulOpd == 2);

  unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
  Register ResultReg = Root.getOperand(0).getReg();
  Register SrcReg0 = MUL->getOperand(1).getReg();
  bool Src0IsKill = MUL->getOperand(1).isKill();
  Register SrcReg1 = MUL->getOperand(2).getReg();
  bool Src1IsKill = MUL->getOperand(2).isKill();

  Register SrcReg2;
  bool Src2IsKill;
  if (ReplacedAddend) {
    // If we just generated a new addend, we must be its only use.
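    // That also makes it safe to mark the register as killed here.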
5681 SrcReg2 = *ReplacedAddend; 5682 Src2IsKill = true; 5683 } else { 5684 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg(); 5685 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill(); 5686 } 5687 5688 if (ResultReg.isVirtual()) 5689 MRI.constrainRegClass(ResultReg, RC); 5690 if (SrcReg0.isVirtual()) 5691 MRI.constrainRegClass(SrcReg0, RC); 5692 if (SrcReg1.isVirtual()) 5693 MRI.constrainRegClass(SrcReg1, RC); 5694 if (SrcReg2.isVirtual()) 5695 MRI.constrainRegClass(SrcReg2, RC); 5696 5697 MachineInstrBuilder MIB; 5698 if (kind == FMAInstKind::Default) 5699 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg) 5700 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 5701 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 5702 .addReg(SrcReg2, getKillRegState(Src2IsKill)); 5703 else if (kind == FMAInstKind::Indexed) 5704 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg) 5705 .addReg(SrcReg2, getKillRegState(Src2IsKill)) 5706 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 5707 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 5708 .addImm(MUL->getOperand(3).getImm()); 5709 else if (kind == FMAInstKind::Accumulator) 5710 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg) 5711 .addReg(SrcReg2, getKillRegState(Src2IsKill)) 5712 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 5713 .addReg(SrcReg1, getKillRegState(Src1IsKill)); 5714 else 5715 assert(false && "Invalid FMA instruction kind \n"); 5716 // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL) 5717 InsInstrs.push_back(MIB); 5718 return MUL; 5719 } 5720 5721 static MachineInstr * 5722 genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI, 5723 const TargetInstrInfo *TII, MachineInstr &Root, 5724 SmallVectorImpl<MachineInstr *> &InsInstrs) { 5725 MachineInstr *MAD = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); 5726 5727 unsigned Opc = 0; 5728 const TargetRegisterClass *RC = MRI.getRegClass(MAD->getOperand(0).getReg()); 5729 if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 5730 Opc = AArch64::FNMADDSrrr; 5731 else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) 5732 Opc = AArch64::FNMADDDrrr; 5733 else 5734 return nullptr; 5735 5736 Register ResultReg = Root.getOperand(0).getReg(); 5737 Register SrcReg0 = MAD->getOperand(1).getReg(); 5738 Register SrcReg1 = MAD->getOperand(2).getReg(); 5739 Register SrcReg2 = MAD->getOperand(3).getReg(); 5740 bool Src0IsKill = MAD->getOperand(1).isKill(); 5741 bool Src1IsKill = MAD->getOperand(2).isKill(); 5742 bool Src2IsKill = MAD->getOperand(3).isKill(); 5743 if (ResultReg.isVirtual()) 5744 MRI.constrainRegClass(ResultReg, RC); 5745 if (SrcReg0.isVirtual()) 5746 MRI.constrainRegClass(SrcReg0, RC); 5747 if (SrcReg1.isVirtual()) 5748 MRI.constrainRegClass(SrcReg1, RC); 5749 if (SrcReg2.isVirtual()) 5750 MRI.constrainRegClass(SrcReg2, RC); 5751 5752 MachineInstrBuilder MIB = 5753 BuildMI(MF, MIMetadata(Root), TII->get(Opc), ResultReg) 5754 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 5755 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 5756 .addReg(SrcReg2, getKillRegState(Src2IsKill)); 5757 InsInstrs.push_back(MIB); 5758 5759 return MAD; 5760 } 5761 5762 /// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane) 5763 static MachineInstr * 5764 genIndexedMultiply(MachineInstr &Root, 5765 SmallVectorImpl<MachineInstr *> &InsInstrs, 5766 unsigned IdxDupOp, unsigned MulOpc, 5767 const TargetRegisterClass *RC, MachineRegisterInfo &MRI) { 5768 assert(((IdxDupOp == 1) || (IdxDupOp == 2)) && 5769 "Invalid index of FMUL operand"); 5770 5771 MachineFunction &MF = *Root.getMF(); 5772 const 
TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); 5773 5774 MachineInstr *Dup = 5775 MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg()); 5776 5777 if (Dup->getOpcode() == TargetOpcode::COPY) 5778 Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg()); 5779 5780 Register DupSrcReg = Dup->getOperand(1).getReg(); 5781 MRI.clearKillFlags(DupSrcReg); 5782 MRI.constrainRegClass(DupSrcReg, RC); 5783 5784 unsigned DupSrcLane = Dup->getOperand(2).getImm(); 5785 5786 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1; 5787 MachineOperand &MulOp = Root.getOperand(IdxMulOp); 5788 5789 Register ResultReg = Root.getOperand(0).getReg(); 5790 5791 MachineInstrBuilder MIB; 5792 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg) 5793 .add(MulOp) 5794 .addReg(DupSrcReg) 5795 .addImm(DupSrcLane); 5796 5797 InsInstrs.push_back(MIB); 5798 return &Root; 5799 } 5800 5801 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate 5802 /// instructions. 5803 /// 5804 /// \see genFusedMultiply 5805 static MachineInstr *genFusedMultiplyAcc( 5806 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 5807 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 5808 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) { 5809 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 5810 FMAInstKind::Accumulator); 5811 } 5812 5813 /// genNeg - Helper to generate an intermediate negation of the second operand 5814 /// of Root 5815 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI, 5816 const TargetInstrInfo *TII, MachineInstr &Root, 5817 SmallVectorImpl<MachineInstr *> &InsInstrs, 5818 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, 5819 unsigned MnegOpc, const TargetRegisterClass *RC) { 5820 Register NewVR = MRI.createVirtualRegister(RC); 5821 MachineInstrBuilder MIB = 5822 BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR) 5823 .add(Root.getOperand(2)); 5824 InsInstrs.push_back(MIB); 5825 5826 assert(InstrIdxForVirtReg.empty()); 5827 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5828 5829 return NewVR; 5830 } 5831 5832 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate 5833 /// instructions with an additional negation of the accumulator 5834 static MachineInstr *genFusedMultiplyAccNeg( 5835 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 5836 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 5837 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd, 5838 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) { 5839 assert(IdxMulOpd == 1); 5840 5841 Register NewVR = 5842 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC); 5843 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 5844 FMAInstKind::Accumulator, &NewVR); 5845 } 5846 5847 /// genFusedMultiplyIdx - Helper to generate fused multiply accumulate 5848 /// instructions. 
///
/// \see genFusedMultiply
static MachineInstr *genFusedMultiplyIdx(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Indexed);
}

/// genFusedMultiplyIdxNeg - Helper to generate fused multiply indexed
/// instructions with an additional negation of the accumulator
static MachineInstr *genFusedMultiplyIdxNeg(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
    unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
  assert(IdxMulOpd == 1);

  Register NewVR =
      genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);

  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Indexed, &NewVR);
}

/// genMaddR - Generate madd instruction and combine mul and add using
/// an extra virtual register
/// Example - an ADD intermediate needs to be stored in a register:
///   MUL I=A,B,0
///   ADD R,I,Imm
///   ==> ORR  V, ZR, Imm
///   ==> MADD R,A,B,V
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the madd instruction
/// \param VR is a virtual register that holds the value of an ADD operand
/// (V in the example above).
5893 /// \param RC Register class of operands 5894 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, 5895 const TargetInstrInfo *TII, MachineInstr &Root, 5896 SmallVectorImpl<MachineInstr *> &InsInstrs, 5897 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, 5898 const TargetRegisterClass *RC) { 5899 assert(IdxMulOpd == 1 || IdxMulOpd == 2); 5900 5901 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); 5902 Register ResultReg = Root.getOperand(0).getReg(); 5903 Register SrcReg0 = MUL->getOperand(1).getReg(); 5904 bool Src0IsKill = MUL->getOperand(1).isKill(); 5905 Register SrcReg1 = MUL->getOperand(2).getReg(); 5906 bool Src1IsKill = MUL->getOperand(2).isKill(); 5907 5908 if (ResultReg.isVirtual()) 5909 MRI.constrainRegClass(ResultReg, RC); 5910 if (SrcReg0.isVirtual()) 5911 MRI.constrainRegClass(SrcReg0, RC); 5912 if (SrcReg1.isVirtual()) 5913 MRI.constrainRegClass(SrcReg1, RC); 5914 if (Register::isVirtualRegister(VR)) 5915 MRI.constrainRegClass(VR, RC); 5916 5917 MachineInstrBuilder MIB = 5918 BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg) 5919 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 5920 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 5921 .addReg(VR); 5922 // Insert the MADD 5923 InsInstrs.push_back(MIB); 5924 return MUL; 5925 } 5926 5927 /// Do the following transformation 5928 /// A - (B + C) ==> (A - B) - C 5929 /// A - (B + C) ==> (A - C) - B 5930 static void 5931 genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI, 5932 const TargetInstrInfo *TII, MachineInstr &Root, 5933 SmallVectorImpl<MachineInstr *> &InsInstrs, 5934 SmallVectorImpl<MachineInstr *> &DelInstrs, 5935 unsigned IdxOpd1, 5936 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) { 5937 assert(IdxOpd1 == 1 || IdxOpd1 == 2); 5938 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 
2 : 1; 5939 MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg()); 5940 5941 Register ResultReg = Root.getOperand(0).getReg(); 5942 Register RegA = Root.getOperand(1).getReg(); 5943 bool RegAIsKill = Root.getOperand(1).isKill(); 5944 Register RegB = AddMI->getOperand(IdxOpd1).getReg(); 5945 bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill(); 5946 Register RegC = AddMI->getOperand(IdxOtherOpd).getReg(); 5947 bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill(); 5948 Register NewVR = MRI.createVirtualRegister(MRI.getRegClass(RegA)); 5949 5950 unsigned Opcode = Root.getOpcode(); 5951 if (Opcode == AArch64::SUBSWrr) 5952 Opcode = AArch64::SUBWrr; 5953 else if (Opcode == AArch64::SUBSXrr) 5954 Opcode = AArch64::SUBXrr; 5955 else 5956 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) && 5957 "Unexpected instruction opcode."); 5958 5959 MachineInstrBuilder MIB1 = 5960 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR) 5961 .addReg(RegA, getKillRegState(RegAIsKill)) 5962 .addReg(RegB, getKillRegState(RegBIsKill)); 5963 MachineInstrBuilder MIB2 = 5964 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg) 5965 .addReg(NewVR, getKillRegState(true)) 5966 .addReg(RegC, getKillRegState(RegCIsKill)); 5967 5968 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5969 InsInstrs.push_back(MIB1); 5970 InsInstrs.push_back(MIB2); 5971 DelInstrs.push_back(AddMI); 5972 } 5973 5974 /// When getMachineCombinerPatterns() finds potential patterns, 5975 /// this function generates the instructions that could replace the 5976 /// original code sequence 5977 void AArch64InstrInfo::genAlternativeCodeSequence( 5978 MachineInstr &Root, MachineCombinerPattern Pattern, 5979 SmallVectorImpl<MachineInstr *> &InsInstrs, 5980 SmallVectorImpl<MachineInstr *> &DelInstrs, 5981 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const { 5982 MachineBasicBlock &MBB = *Root.getParent(); 5983 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5984 MachineFunction &MF = *MBB.getParent(); 5985 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); 5986 5987 MachineInstr *MUL = nullptr; 5988 const TargetRegisterClass *RC; 5989 unsigned Opc; 5990 switch (Pattern) { 5991 default: 5992 // Reassociate instructions. 
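    // (Patterns not handled in the cases below, e.g. the generic REASSOC_*
    // patterns produced via isAssociativeAndCommutative(), are expanded by the
    // base-class implementation.)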
5993 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, 5994 DelInstrs, InstrIdxForVirtReg); 5995 return; 5996 case MachineCombinerPattern::SUBADD_OP1: 5997 // A - (B + C) 5998 // ==> (A - B) - C 5999 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1, 6000 InstrIdxForVirtReg); 6001 break; 6002 case MachineCombinerPattern::SUBADD_OP2: 6003 // A - (B + C) 6004 // ==> (A - C) - B 6005 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2, 6006 InstrIdxForVirtReg); 6007 break; 6008 case MachineCombinerPattern::MULADDW_OP1: 6009 case MachineCombinerPattern::MULADDX_OP1: 6010 // MUL I=A,B,0 6011 // ADD R,I,C 6012 // ==> MADD R,A,B,C 6013 // --- Create(MADD); 6014 if (Pattern == MachineCombinerPattern::MULADDW_OP1) { 6015 Opc = AArch64::MADDWrrr; 6016 RC = &AArch64::GPR32RegClass; 6017 } else { 6018 Opc = AArch64::MADDXrrr; 6019 RC = &AArch64::GPR64RegClass; 6020 } 6021 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6022 break; 6023 case MachineCombinerPattern::MULADDW_OP2: 6024 case MachineCombinerPattern::MULADDX_OP2: 6025 // MUL I=A,B,0 6026 // ADD R,C,I 6027 // ==> MADD R,A,B,C 6028 // --- Create(MADD); 6029 if (Pattern == MachineCombinerPattern::MULADDW_OP2) { 6030 Opc = AArch64::MADDWrrr; 6031 RC = &AArch64::GPR32RegClass; 6032 } else { 6033 Opc = AArch64::MADDXrrr; 6034 RC = &AArch64::GPR64RegClass; 6035 } 6036 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6037 break; 6038 case MachineCombinerPattern::MULADDWI_OP1: 6039 case MachineCombinerPattern::MULADDXI_OP1: { 6040 // MUL I=A,B,0 6041 // ADD R,I,Imm 6042 // ==> MOV V, Imm 6043 // ==> MADD R,A,B,V 6044 // --- Create(MADD); 6045 const TargetRegisterClass *OrrRC; 6046 unsigned BitSize, OrrOpc, ZeroReg; 6047 if (Pattern == MachineCombinerPattern::MULADDWI_OP1) { 6048 OrrOpc = AArch64::ORRWri; 6049 OrrRC = &AArch64::GPR32spRegClass; 6050 BitSize = 32; 6051 ZeroReg = AArch64::WZR; 6052 Opc = AArch64::MADDWrrr; 6053 RC = &AArch64::GPR32RegClass; 6054 } else { 6055 OrrOpc = AArch64::ORRXri; 6056 OrrRC = &AArch64::GPR64spRegClass; 6057 BitSize = 64; 6058 ZeroReg = AArch64::XZR; 6059 Opc = AArch64::MADDXrrr; 6060 RC = &AArch64::GPR64RegClass; 6061 } 6062 Register NewVR = MRI.createVirtualRegister(OrrRC); 6063 uint64_t Imm = Root.getOperand(2).getImm(); 6064 6065 if (Root.getOperand(3).isImm()) { 6066 unsigned Val = Root.getOperand(3).getImm(); 6067 Imm = Imm << Val; 6068 } 6069 uint64_t UImm = SignExtend64(Imm, BitSize); 6070 // The immediate can be composed via a single instruction. 6071 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 6072 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn); 6073 if (Insn.size() != 1) 6074 return; 6075 auto MovI = Insn.begin(); 6076 MachineInstrBuilder MIB1; 6077 // MOV is an alias for one of three instructions: movz, movn, and orr. 
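    // For example (illustrative values only): MOV w0, #1 becomes MOVZ,
    // MOV w0, #-3 becomes MOVN, and MOV w0, #0x55555555 becomes an ORR with a
    // logical immediate.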
6078 if (MovI->Opcode == OrrOpc) 6079 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR) 6080 .addReg(ZeroReg) 6081 .addImm(MovI->Op2); 6082 else { 6083 if (BitSize == 32) 6084 assert((MovI->Opcode == AArch64::MOVNWi || 6085 MovI->Opcode == AArch64::MOVZWi) && 6086 "Expected opcode"); 6087 else 6088 assert((MovI->Opcode == AArch64::MOVNXi || 6089 MovI->Opcode == AArch64::MOVZXi) && 6090 "Expected opcode"); 6091 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR) 6092 .addImm(MovI->Op1) 6093 .addImm(MovI->Op2); 6094 } 6095 InsInstrs.push_back(MIB1); 6096 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6097 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 6098 break; 6099 } 6100 case MachineCombinerPattern::MULSUBW_OP1: 6101 case MachineCombinerPattern::MULSUBX_OP1: { 6102 // MUL I=A,B,0 6103 // SUB R,I, C 6104 // ==> SUB V, 0, C 6105 // ==> MADD R,A,B,V // = -C + A*B 6106 // --- Create(MADD); 6107 const TargetRegisterClass *SubRC; 6108 unsigned SubOpc, ZeroReg; 6109 if (Pattern == MachineCombinerPattern::MULSUBW_OP1) { 6110 SubOpc = AArch64::SUBWrr; 6111 SubRC = &AArch64::GPR32spRegClass; 6112 ZeroReg = AArch64::WZR; 6113 Opc = AArch64::MADDWrrr; 6114 RC = &AArch64::GPR32RegClass; 6115 } else { 6116 SubOpc = AArch64::SUBXrr; 6117 SubRC = &AArch64::GPR64spRegClass; 6118 ZeroReg = AArch64::XZR; 6119 Opc = AArch64::MADDXrrr; 6120 RC = &AArch64::GPR64RegClass; 6121 } 6122 Register NewVR = MRI.createVirtualRegister(SubRC); 6123 // SUB NewVR, 0, C 6124 MachineInstrBuilder MIB1 = 6125 BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR) 6126 .addReg(ZeroReg) 6127 .add(Root.getOperand(2)); 6128 InsInstrs.push_back(MIB1); 6129 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6130 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 6131 break; 6132 } 6133 case MachineCombinerPattern::MULSUBW_OP2: 6134 case MachineCombinerPattern::MULSUBX_OP2: 6135 // MUL I=A,B,0 6136 // SUB R,C,I 6137 // ==> MSUB R,A,B,C (computes C - A*B) 6138 // --- Create(MSUB); 6139 if (Pattern == MachineCombinerPattern::MULSUBW_OP2) { 6140 Opc = AArch64::MSUBWrrr; 6141 RC = &AArch64::GPR32RegClass; 6142 } else { 6143 Opc = AArch64::MSUBXrrr; 6144 RC = &AArch64::GPR64RegClass; 6145 } 6146 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6147 break; 6148 case MachineCombinerPattern::MULSUBWI_OP1: 6149 case MachineCombinerPattern::MULSUBXI_OP1: { 6150 // MUL I=A,B,0 6151 // SUB R,I, Imm 6152 // ==> MOV V, -Imm 6153 // ==> MADD R,A,B,V // = -Imm + A*B 6154 // --- Create(MADD); 6155 const TargetRegisterClass *OrrRC; 6156 unsigned BitSize, OrrOpc, ZeroReg; 6157 if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) { 6158 OrrOpc = AArch64::ORRWri; 6159 OrrRC = &AArch64::GPR32spRegClass; 6160 BitSize = 32; 6161 ZeroReg = AArch64::WZR; 6162 Opc = AArch64::MADDWrrr; 6163 RC = &AArch64::GPR32RegClass; 6164 } else { 6165 OrrOpc = AArch64::ORRXri; 6166 OrrRC = &AArch64::GPR64spRegClass; 6167 BitSize = 64; 6168 ZeroReg = AArch64::XZR; 6169 Opc = AArch64::MADDXrrr; 6170 RC = &AArch64::GPR64RegClass; 6171 } 6172 Register NewVR = MRI.createVirtualRegister(OrrRC); 6173 uint64_t Imm = Root.getOperand(2).getImm(); 6174 if (Root.getOperand(3).isImm()) { 6175 unsigned Val = Root.getOperand(3).getImm(); 6176 Imm = Imm << Val; 6177 } 6178 uint64_t UImm = SignExtend64(-Imm, BitSize); 6179 // The immediate can be composed via a single instruction. 
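    // If composing the negated immediate would take more than one instruction
    // (e.g. a MOVZ followed by MOVKs), the combine is abandoned below.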
6180 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 6181 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn); 6182 if (Insn.size() != 1) 6183 return; 6184 auto MovI = Insn.begin(); 6185 MachineInstrBuilder MIB1; 6186 // MOV is an alias for one of three instructions: movz, movn, and orr. 6187 if (MovI->Opcode == OrrOpc) 6188 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR) 6189 .addReg(ZeroReg) 6190 .addImm(MovI->Op2); 6191 else { 6192 if (BitSize == 32) 6193 assert((MovI->Opcode == AArch64::MOVNWi || 6194 MovI->Opcode == AArch64::MOVZWi) && 6195 "Expected opcode"); 6196 else 6197 assert((MovI->Opcode == AArch64::MOVNXi || 6198 MovI->Opcode == AArch64::MOVZXi) && 6199 "Expected opcode"); 6200 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR) 6201 .addImm(MovI->Op1) 6202 .addImm(MovI->Op2); 6203 } 6204 InsInstrs.push_back(MIB1); 6205 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6206 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 6207 break; 6208 } 6209 6210 case MachineCombinerPattern::MULADDv8i8_OP1: 6211 Opc = AArch64::MLAv8i8; 6212 RC = &AArch64::FPR64RegClass; 6213 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6214 break; 6215 case MachineCombinerPattern::MULADDv8i8_OP2: 6216 Opc = AArch64::MLAv8i8; 6217 RC = &AArch64::FPR64RegClass; 6218 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6219 break; 6220 case MachineCombinerPattern::MULADDv16i8_OP1: 6221 Opc = AArch64::MLAv16i8; 6222 RC = &AArch64::FPR128RegClass; 6223 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6224 break; 6225 case MachineCombinerPattern::MULADDv16i8_OP2: 6226 Opc = AArch64::MLAv16i8; 6227 RC = &AArch64::FPR128RegClass; 6228 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6229 break; 6230 case MachineCombinerPattern::MULADDv4i16_OP1: 6231 Opc = AArch64::MLAv4i16; 6232 RC = &AArch64::FPR64RegClass; 6233 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6234 break; 6235 case MachineCombinerPattern::MULADDv4i16_OP2: 6236 Opc = AArch64::MLAv4i16; 6237 RC = &AArch64::FPR64RegClass; 6238 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6239 break; 6240 case MachineCombinerPattern::MULADDv8i16_OP1: 6241 Opc = AArch64::MLAv8i16; 6242 RC = &AArch64::FPR128RegClass; 6243 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6244 break; 6245 case MachineCombinerPattern::MULADDv8i16_OP2: 6246 Opc = AArch64::MLAv8i16; 6247 RC = &AArch64::FPR128RegClass; 6248 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6249 break; 6250 case MachineCombinerPattern::MULADDv2i32_OP1: 6251 Opc = AArch64::MLAv2i32; 6252 RC = &AArch64::FPR64RegClass; 6253 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6254 break; 6255 case MachineCombinerPattern::MULADDv2i32_OP2: 6256 Opc = AArch64::MLAv2i32; 6257 RC = &AArch64::FPR64RegClass; 6258 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6259 break; 6260 case MachineCombinerPattern::MULADDv4i32_OP1: 6261 Opc = AArch64::MLAv4i32; 6262 RC = &AArch64::FPR128RegClass; 6263 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6264 break; 6265 case MachineCombinerPattern::MULADDv4i32_OP2: 6266 Opc = AArch64::MLAv4i32; 6267 RC = &AArch64::FPR128RegClass; 6268 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6269 break; 6270 6271 case MachineCombinerPattern::MULSUBv8i8_OP1: 6272 Opc = AArch64::MLAv8i8; 6273 RC = 
&AArch64::FPR64RegClass; 6274 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 6275 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8, 6276 RC); 6277 break; 6278 case MachineCombinerPattern::MULSUBv8i8_OP2: 6279 Opc = AArch64::MLSv8i8; 6280 RC = &AArch64::FPR64RegClass; 6281 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6282 break; 6283 case MachineCombinerPattern::MULSUBv16i8_OP1: 6284 Opc = AArch64::MLAv16i8; 6285 RC = &AArch64::FPR128RegClass; 6286 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 6287 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8, 6288 RC); 6289 break; 6290 case MachineCombinerPattern::MULSUBv16i8_OP2: 6291 Opc = AArch64::MLSv16i8; 6292 RC = &AArch64::FPR128RegClass; 6293 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6294 break; 6295 case MachineCombinerPattern::MULSUBv4i16_OP1: 6296 Opc = AArch64::MLAv4i16; 6297 RC = &AArch64::FPR64RegClass; 6298 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 6299 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, 6300 RC); 6301 break; 6302 case MachineCombinerPattern::MULSUBv4i16_OP2: 6303 Opc = AArch64::MLSv4i16; 6304 RC = &AArch64::FPR64RegClass; 6305 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6306 break; 6307 case MachineCombinerPattern::MULSUBv8i16_OP1: 6308 Opc = AArch64::MLAv8i16; 6309 RC = &AArch64::FPR128RegClass; 6310 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 6311 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, 6312 RC); 6313 break; 6314 case MachineCombinerPattern::MULSUBv8i16_OP2: 6315 Opc = AArch64::MLSv8i16; 6316 RC = &AArch64::FPR128RegClass; 6317 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6318 break; 6319 case MachineCombinerPattern::MULSUBv2i32_OP1: 6320 Opc = AArch64::MLAv2i32; 6321 RC = &AArch64::FPR64RegClass; 6322 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 6323 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, 6324 RC); 6325 break; 6326 case MachineCombinerPattern::MULSUBv2i32_OP2: 6327 Opc = AArch64::MLSv2i32; 6328 RC = &AArch64::FPR64RegClass; 6329 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6330 break; 6331 case MachineCombinerPattern::MULSUBv4i32_OP1: 6332 Opc = AArch64::MLAv4i32; 6333 RC = &AArch64::FPR128RegClass; 6334 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 6335 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, 6336 RC); 6337 break; 6338 case MachineCombinerPattern::MULSUBv4i32_OP2: 6339 Opc = AArch64::MLSv4i32; 6340 RC = &AArch64::FPR128RegClass; 6341 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6342 break; 6343 6344 case MachineCombinerPattern::MULADDv4i16_indexed_OP1: 6345 Opc = AArch64::MLAv4i16_indexed; 6346 RC = &AArch64::FPR64RegClass; 6347 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6348 break; 6349 case MachineCombinerPattern::MULADDv4i16_indexed_OP2: 6350 Opc = AArch64::MLAv4i16_indexed; 6351 RC = &AArch64::FPR64RegClass; 6352 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6353 break; 6354 case MachineCombinerPattern::MULADDv8i16_indexed_OP1: 6355 Opc = AArch64::MLAv8i16_indexed; 6356 RC = &AArch64::FPR128RegClass; 6357 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6358 break; 6359 case MachineCombinerPattern::MULADDv8i16_indexed_OP2: 6360 Opc = AArch64::MLAv8i16_indexed; 6361 RC = &AArch64::FPR128RegClass; 6362 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6363 break; 6364 case 
MachineCombinerPattern::MULADDv2i32_indexed_OP1: 6365 Opc = AArch64::MLAv2i32_indexed; 6366 RC = &AArch64::FPR64RegClass; 6367 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6368 break; 6369 case MachineCombinerPattern::MULADDv2i32_indexed_OP2: 6370 Opc = AArch64::MLAv2i32_indexed; 6371 RC = &AArch64::FPR64RegClass; 6372 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6373 break; 6374 case MachineCombinerPattern::MULADDv4i32_indexed_OP1: 6375 Opc = AArch64::MLAv4i32_indexed; 6376 RC = &AArch64::FPR128RegClass; 6377 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6378 break; 6379 case MachineCombinerPattern::MULADDv4i32_indexed_OP2: 6380 Opc = AArch64::MLAv4i32_indexed; 6381 RC = &AArch64::FPR128RegClass; 6382 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6383 break; 6384 6385 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 6386 Opc = AArch64::MLAv4i16_indexed; 6387 RC = &AArch64::FPR64RegClass; 6388 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 6389 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, 6390 RC); 6391 break; 6392 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 6393 Opc = AArch64::MLSv4i16_indexed; 6394 RC = &AArch64::FPR64RegClass; 6395 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6396 break; 6397 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 6398 Opc = AArch64::MLAv8i16_indexed; 6399 RC = &AArch64::FPR128RegClass; 6400 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 6401 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, 6402 RC); 6403 break; 6404 case MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 6405 Opc = AArch64::MLSv8i16_indexed; 6406 RC = &AArch64::FPR128RegClass; 6407 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6408 break; 6409 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 6410 Opc = AArch64::MLAv2i32_indexed; 6411 RC = &AArch64::FPR64RegClass; 6412 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 6413 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, 6414 RC); 6415 break; 6416 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 6417 Opc = AArch64::MLSv2i32_indexed; 6418 RC = &AArch64::FPR64RegClass; 6419 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6420 break; 6421 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 6422 Opc = AArch64::MLAv4i32_indexed; 6423 RC = &AArch64::FPR128RegClass; 6424 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 6425 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, 6426 RC); 6427 break; 6428 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 6429 Opc = AArch64::MLSv4i32_indexed; 6430 RC = &AArch64::FPR128RegClass; 6431 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6432 break; 6433 6434 // Floating Point Support 6435 case MachineCombinerPattern::FMULADDH_OP1: 6436 Opc = AArch64::FMADDHrrr; 6437 RC = &AArch64::FPR16RegClass; 6438 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6439 break; 6440 case MachineCombinerPattern::FMULADDS_OP1: 6441 Opc = AArch64::FMADDSrrr; 6442 RC = &AArch64::FPR32RegClass; 6443 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6444 break; 6445 case MachineCombinerPattern::FMULADDD_OP1: 6446 Opc = AArch64::FMADDDrrr; 6447 RC = &AArch64::FPR64RegClass; 6448 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6449 break; 6450 6451 case MachineCombinerPattern::FMULADDH_OP2: 6452 Opc = AArch64::FMADDHrrr; 
6453 RC = &AArch64::FPR16RegClass; 6454 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6455 break; 6456 case MachineCombinerPattern::FMULADDS_OP2: 6457 Opc = AArch64::FMADDSrrr; 6458 RC = &AArch64::FPR32RegClass; 6459 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6460 break; 6461 case MachineCombinerPattern::FMULADDD_OP2: 6462 Opc = AArch64::FMADDDrrr; 6463 RC = &AArch64::FPR64RegClass; 6464 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6465 break; 6466 6467 case MachineCombinerPattern::FMLAv1i32_indexed_OP1: 6468 Opc = AArch64::FMLAv1i32_indexed; 6469 RC = &AArch64::FPR32RegClass; 6470 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6471 FMAInstKind::Indexed); 6472 break; 6473 case MachineCombinerPattern::FMLAv1i32_indexed_OP2: 6474 Opc = AArch64::FMLAv1i32_indexed; 6475 RC = &AArch64::FPR32RegClass; 6476 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6477 FMAInstKind::Indexed); 6478 break; 6479 6480 case MachineCombinerPattern::FMLAv1i64_indexed_OP1: 6481 Opc = AArch64::FMLAv1i64_indexed; 6482 RC = &AArch64::FPR64RegClass; 6483 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6484 FMAInstKind::Indexed); 6485 break; 6486 case MachineCombinerPattern::FMLAv1i64_indexed_OP2: 6487 Opc = AArch64::FMLAv1i64_indexed; 6488 RC = &AArch64::FPR64RegClass; 6489 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6490 FMAInstKind::Indexed); 6491 break; 6492 6493 case MachineCombinerPattern::FMLAv4i16_indexed_OP1: 6494 RC = &AArch64::FPR64RegClass; 6495 Opc = AArch64::FMLAv4i16_indexed; 6496 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6497 FMAInstKind::Indexed); 6498 break; 6499 case MachineCombinerPattern::FMLAv4f16_OP1: 6500 RC = &AArch64::FPR64RegClass; 6501 Opc = AArch64::FMLAv4f16; 6502 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6503 FMAInstKind::Accumulator); 6504 break; 6505 case MachineCombinerPattern::FMLAv4i16_indexed_OP2: 6506 RC = &AArch64::FPR64RegClass; 6507 Opc = AArch64::FMLAv4i16_indexed; 6508 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6509 FMAInstKind::Indexed); 6510 break; 6511 case MachineCombinerPattern::FMLAv4f16_OP2: 6512 RC = &AArch64::FPR64RegClass; 6513 Opc = AArch64::FMLAv4f16; 6514 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6515 FMAInstKind::Accumulator); 6516 break; 6517 6518 case MachineCombinerPattern::FMLAv2i32_indexed_OP1: 6519 case MachineCombinerPattern::FMLAv2f32_OP1: 6520 RC = &AArch64::FPR64RegClass; 6521 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) { 6522 Opc = AArch64::FMLAv2i32_indexed; 6523 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6524 FMAInstKind::Indexed); 6525 } else { 6526 Opc = AArch64::FMLAv2f32; 6527 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6528 FMAInstKind::Accumulator); 6529 } 6530 break; 6531 case MachineCombinerPattern::FMLAv2i32_indexed_OP2: 6532 case MachineCombinerPattern::FMLAv2f32_OP2: 6533 RC = &AArch64::FPR64RegClass; 6534 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) { 6535 Opc = AArch64::FMLAv2i32_indexed; 6536 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6537 FMAInstKind::Indexed); 6538 } else { 6539 Opc = AArch64::FMLAv2f32; 6540 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6541 FMAInstKind::Accumulator); 6542 } 6543 break; 6544 6545 case MachineCombinerPattern::FMLAv8i16_indexed_OP1: 6546 RC = 
&AArch64::FPR128RegClass; 6547 Opc = AArch64::FMLAv8i16_indexed; 6548 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6549 FMAInstKind::Indexed); 6550 break; 6551 case MachineCombinerPattern::FMLAv8f16_OP1: 6552 RC = &AArch64::FPR128RegClass; 6553 Opc = AArch64::FMLAv8f16; 6554 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6555 FMAInstKind::Accumulator); 6556 break; 6557 case MachineCombinerPattern::FMLAv8i16_indexed_OP2: 6558 RC = &AArch64::FPR128RegClass; 6559 Opc = AArch64::FMLAv8i16_indexed; 6560 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6561 FMAInstKind::Indexed); 6562 break; 6563 case MachineCombinerPattern::FMLAv8f16_OP2: 6564 RC = &AArch64::FPR128RegClass; 6565 Opc = AArch64::FMLAv8f16; 6566 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6567 FMAInstKind::Accumulator); 6568 break; 6569 6570 case MachineCombinerPattern::FMLAv2i64_indexed_OP1: 6571 case MachineCombinerPattern::FMLAv2f64_OP1: 6572 RC = &AArch64::FPR128RegClass; 6573 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) { 6574 Opc = AArch64::FMLAv2i64_indexed; 6575 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6576 FMAInstKind::Indexed); 6577 } else { 6578 Opc = AArch64::FMLAv2f64; 6579 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6580 FMAInstKind::Accumulator); 6581 } 6582 break; 6583 case MachineCombinerPattern::FMLAv2i64_indexed_OP2: 6584 case MachineCombinerPattern::FMLAv2f64_OP2: 6585 RC = &AArch64::FPR128RegClass; 6586 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) { 6587 Opc = AArch64::FMLAv2i64_indexed; 6588 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6589 FMAInstKind::Indexed); 6590 } else { 6591 Opc = AArch64::FMLAv2f64; 6592 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6593 FMAInstKind::Accumulator); 6594 } 6595 break; 6596 6597 case MachineCombinerPattern::FMLAv4i32_indexed_OP1: 6598 case MachineCombinerPattern::FMLAv4f32_OP1: 6599 RC = &AArch64::FPR128RegClass; 6600 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) { 6601 Opc = AArch64::FMLAv4i32_indexed; 6602 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6603 FMAInstKind::Indexed); 6604 } else { 6605 Opc = AArch64::FMLAv4f32; 6606 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6607 FMAInstKind::Accumulator); 6608 } 6609 break; 6610 6611 case MachineCombinerPattern::FMLAv4i32_indexed_OP2: 6612 case MachineCombinerPattern::FMLAv4f32_OP2: 6613 RC = &AArch64::FPR128RegClass; 6614 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) { 6615 Opc = AArch64::FMLAv4i32_indexed; 6616 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6617 FMAInstKind::Indexed); 6618 } else { 6619 Opc = AArch64::FMLAv4f32; 6620 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6621 FMAInstKind::Accumulator); 6622 } 6623 break; 6624 6625 case MachineCombinerPattern::FMULSUBH_OP1: 6626 Opc = AArch64::FNMSUBHrrr; 6627 RC = &AArch64::FPR16RegClass; 6628 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6629 break; 6630 case MachineCombinerPattern::FMULSUBS_OP1: 6631 Opc = AArch64::FNMSUBSrrr; 6632 RC = &AArch64::FPR32RegClass; 6633 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6634 break; 6635 case MachineCombinerPattern::FMULSUBD_OP1: 6636 Opc = AArch64::FNMSUBDrrr; 6637 RC = &AArch64::FPR64RegClass; 6638 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6639 
break; 6640 6641 case MachineCombinerPattern::FNMULSUBH_OP1: 6642 Opc = AArch64::FNMADDHrrr; 6643 RC = &AArch64::FPR16RegClass; 6644 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6645 break; 6646 case MachineCombinerPattern::FNMULSUBS_OP1: 6647 Opc = AArch64::FNMADDSrrr; 6648 RC = &AArch64::FPR32RegClass; 6649 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6650 break; 6651 case MachineCombinerPattern::FNMULSUBD_OP1: 6652 Opc = AArch64::FNMADDDrrr; 6653 RC = &AArch64::FPR64RegClass; 6654 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6655 break; 6656 6657 case MachineCombinerPattern::FMULSUBH_OP2: 6658 Opc = AArch64::FMSUBHrrr; 6659 RC = &AArch64::FPR16RegClass; 6660 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6661 break; 6662 case MachineCombinerPattern::FMULSUBS_OP2: 6663 Opc = AArch64::FMSUBSrrr; 6664 RC = &AArch64::FPR32RegClass; 6665 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6666 break; 6667 case MachineCombinerPattern::FMULSUBD_OP2: 6668 Opc = AArch64::FMSUBDrrr; 6669 RC = &AArch64::FPR64RegClass; 6670 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6671 break; 6672 6673 case MachineCombinerPattern::FMLSv1i32_indexed_OP2: 6674 Opc = AArch64::FMLSv1i32_indexed; 6675 RC = &AArch64::FPR32RegClass; 6676 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6677 FMAInstKind::Indexed); 6678 break; 6679 6680 case MachineCombinerPattern::FMLSv1i64_indexed_OP2: 6681 Opc = AArch64::FMLSv1i64_indexed; 6682 RC = &AArch64::FPR64RegClass; 6683 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6684 FMAInstKind::Indexed); 6685 break; 6686 6687 case MachineCombinerPattern::FMLSv4f16_OP1: 6688 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: { 6689 RC = &AArch64::FPR64RegClass; 6690 Register NewVR = MRI.createVirtualRegister(RC); 6691 MachineInstrBuilder MIB1 = 6692 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR) 6693 .add(Root.getOperand(2)); 6694 InsInstrs.push_back(MIB1); 6695 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6696 if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) { 6697 Opc = AArch64::FMLAv4f16; 6698 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6699 FMAInstKind::Accumulator, &NewVR); 6700 } else { 6701 Opc = AArch64::FMLAv4i16_indexed; 6702 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6703 FMAInstKind::Indexed, &NewVR); 6704 } 6705 break; 6706 } 6707 case MachineCombinerPattern::FMLSv4f16_OP2: 6708 RC = &AArch64::FPR64RegClass; 6709 Opc = AArch64::FMLSv4f16; 6710 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6711 FMAInstKind::Accumulator); 6712 break; 6713 case MachineCombinerPattern::FMLSv4i16_indexed_OP2: 6714 RC = &AArch64::FPR64RegClass; 6715 Opc = AArch64::FMLSv4i16_indexed; 6716 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6717 FMAInstKind::Indexed); 6718 break; 6719 6720 case MachineCombinerPattern::FMLSv2f32_OP2: 6721 case MachineCombinerPattern::FMLSv2i32_indexed_OP2: 6722 RC = &AArch64::FPR64RegClass; 6723 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) { 6724 Opc = AArch64::FMLSv2i32_indexed; 6725 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6726 FMAInstKind::Indexed); 6727 } else { 6728 Opc = AArch64::FMLSv2f32; 6729 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6730 FMAInstKind::Accumulator); 6731 } 6732 break; 6733 6734 case 
MachineCombinerPattern::FMLSv8f16_OP1: 6735 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: { 6736 RC = &AArch64::FPR128RegClass; 6737 Register NewVR = MRI.createVirtualRegister(RC); 6738 MachineInstrBuilder MIB1 = 6739 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR) 6740 .add(Root.getOperand(2)); 6741 InsInstrs.push_back(MIB1); 6742 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6743 if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) { 6744 Opc = AArch64::FMLAv8f16; 6745 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6746 FMAInstKind::Accumulator, &NewVR); 6747 } else { 6748 Opc = AArch64::FMLAv8i16_indexed; 6749 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6750 FMAInstKind::Indexed, &NewVR); 6751 } 6752 break; 6753 } 6754 case MachineCombinerPattern::FMLSv8f16_OP2: 6755 RC = &AArch64::FPR128RegClass; 6756 Opc = AArch64::FMLSv8f16; 6757 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6758 FMAInstKind::Accumulator); 6759 break; 6760 case MachineCombinerPattern::FMLSv8i16_indexed_OP2: 6761 RC = &AArch64::FPR128RegClass; 6762 Opc = AArch64::FMLSv8i16_indexed; 6763 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6764 FMAInstKind::Indexed); 6765 break; 6766 6767 case MachineCombinerPattern::FMLSv2f64_OP2: 6768 case MachineCombinerPattern::FMLSv2i64_indexed_OP2: 6769 RC = &AArch64::FPR128RegClass; 6770 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) { 6771 Opc = AArch64::FMLSv2i64_indexed; 6772 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6773 FMAInstKind::Indexed); 6774 } else { 6775 Opc = AArch64::FMLSv2f64; 6776 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6777 FMAInstKind::Accumulator); 6778 } 6779 break; 6780 6781 case MachineCombinerPattern::FMLSv4f32_OP2: 6782 case MachineCombinerPattern::FMLSv4i32_indexed_OP2: 6783 RC = &AArch64::FPR128RegClass; 6784 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) { 6785 Opc = AArch64::FMLSv4i32_indexed; 6786 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6787 FMAInstKind::Indexed); 6788 } else { 6789 Opc = AArch64::FMLSv4f32; 6790 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6791 FMAInstKind::Accumulator); 6792 } 6793 break; 6794 case MachineCombinerPattern::FMLSv2f32_OP1: 6795 case MachineCombinerPattern::FMLSv2i32_indexed_OP1: { 6796 RC = &AArch64::FPR64RegClass; 6797 Register NewVR = MRI.createVirtualRegister(RC); 6798 MachineInstrBuilder MIB1 = 6799 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR) 6800 .add(Root.getOperand(2)); 6801 InsInstrs.push_back(MIB1); 6802 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6803 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) { 6804 Opc = AArch64::FMLAv2i32_indexed; 6805 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6806 FMAInstKind::Indexed, &NewVR); 6807 } else { 6808 Opc = AArch64::FMLAv2f32; 6809 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6810 FMAInstKind::Accumulator, &NewVR); 6811 } 6812 break; 6813 } 6814 case MachineCombinerPattern::FMLSv4f32_OP1: 6815 case MachineCombinerPattern::FMLSv4i32_indexed_OP1: { 6816 RC = &AArch64::FPR128RegClass; 6817 Register NewVR = MRI.createVirtualRegister(RC); 6818 MachineInstrBuilder MIB1 = 6819 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR) 6820 .add(Root.getOperand(2)); 6821 InsInstrs.push_back(MIB1); 6822 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 
0)); 6823 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) { 6824 Opc = AArch64::FMLAv4i32_indexed; 6825 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6826 FMAInstKind::Indexed, &NewVR); 6827 } else { 6828 Opc = AArch64::FMLAv4f32; 6829 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6830 FMAInstKind::Accumulator, &NewVR); 6831 } 6832 break; 6833 } 6834 case MachineCombinerPattern::FMLSv2f64_OP1: 6835 case MachineCombinerPattern::FMLSv2i64_indexed_OP1: { 6836 RC = &AArch64::FPR128RegClass; 6837 Register NewVR = MRI.createVirtualRegister(RC); 6838 MachineInstrBuilder MIB1 = 6839 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR) 6840 .add(Root.getOperand(2)); 6841 InsInstrs.push_back(MIB1); 6842 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6843 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) { 6844 Opc = AArch64::FMLAv2i64_indexed; 6845 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6846 FMAInstKind::Indexed, &NewVR); 6847 } else { 6848 Opc = AArch64::FMLAv2f64; 6849 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6850 FMAInstKind::Accumulator, &NewVR); 6851 } 6852 break; 6853 } 6854 case MachineCombinerPattern::FMULv2i32_indexed_OP1: 6855 case MachineCombinerPattern::FMULv2i32_indexed_OP2: { 6856 unsigned IdxDupOp = 6857 (Pattern == MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1 : 2; 6858 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed, 6859 &AArch64::FPR128RegClass, MRI); 6860 break; 6861 } 6862 case MachineCombinerPattern::FMULv2i64_indexed_OP1: 6863 case MachineCombinerPattern::FMULv2i64_indexed_OP2: { 6864 unsigned IdxDupOp = 6865 (Pattern == MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1 : 2; 6866 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed, 6867 &AArch64::FPR128RegClass, MRI); 6868 break; 6869 } 6870 case MachineCombinerPattern::FMULv4i16_indexed_OP1: 6871 case MachineCombinerPattern::FMULv4i16_indexed_OP2: { 6872 unsigned IdxDupOp = 6873 (Pattern == MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1 : 2; 6874 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed, 6875 &AArch64::FPR128_loRegClass, MRI); 6876 break; 6877 } 6878 case MachineCombinerPattern::FMULv4i32_indexed_OP1: 6879 case MachineCombinerPattern::FMULv4i32_indexed_OP2: { 6880 unsigned IdxDupOp = 6881 (Pattern == MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1 : 2; 6882 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed, 6883 &AArch64::FPR128RegClass, MRI); 6884 break; 6885 } 6886 case MachineCombinerPattern::FMULv8i16_indexed_OP1: 6887 case MachineCombinerPattern::FMULv8i16_indexed_OP2: { 6888 unsigned IdxDupOp = 6889 (Pattern == MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1 : 2; 6890 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed, 6891 &AArch64::FPR128_loRegClass, MRI); 6892 break; 6893 } 6894 case MachineCombinerPattern::FNMADD: { 6895 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs); 6896 break; 6897 } 6898 6899 } // end switch (Pattern) 6900 // Record MUL and ADD/SUB for deletion 6901 if (MUL) 6902 DelInstrs.push_back(MUL); 6903 DelInstrs.push_back(&Root); 6904 6905 // Set the flags on the inserted instructions to be the merged flags of the 6906 // instructions that we have combined. 
6907 uint32_t Flags = Root.getFlags(); 6908 if (MUL) 6909 Flags = Root.mergeFlagsWith(*MUL); 6910 for (auto *MI : InsInstrs) 6911 MI->setFlags(Flags); 6912 } 6913 6914 /// Replace csincr-branch sequence by simple conditional branch 6915 /// 6916 /// Examples: 6917 /// 1. \code 6918 /// csinc w9, wzr, wzr, <condition code> 6919 /// tbnz w9, #0, 0x44 6920 /// \endcode 6921 /// to 6922 /// \code 6923 /// b.<inverted condition code> 6924 /// \endcode 6925 /// 6926 /// 2. \code 6927 /// csinc w9, wzr, wzr, <condition code> 6928 /// tbz w9, #0, 0x44 6929 /// \endcode 6930 /// to 6931 /// \code 6932 /// b.<condition code> 6933 /// \endcode 6934 /// 6935 /// Replace compare and branch sequence by TBZ/TBNZ instruction when the 6936 /// compare's constant operand is power of 2. 6937 /// 6938 /// Examples: 6939 /// \code 6940 /// and w8, w8, #0x400 6941 /// cbnz w8, L1 6942 /// \endcode 6943 /// to 6944 /// \code 6945 /// tbnz w8, #10, L1 6946 /// \endcode 6947 /// 6948 /// \param MI Conditional Branch 6949 /// \return True when the simple conditional branch is generated 6950 /// 6951 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const { 6952 bool IsNegativeBranch = false; 6953 bool IsTestAndBranch = false; 6954 unsigned TargetBBInMI = 0; 6955 switch (MI.getOpcode()) { 6956 default: 6957 llvm_unreachable("Unknown branch instruction?"); 6958 case AArch64::Bcc: 6959 return false; 6960 case AArch64::CBZW: 6961 case AArch64::CBZX: 6962 TargetBBInMI = 1; 6963 break; 6964 case AArch64::CBNZW: 6965 case AArch64::CBNZX: 6966 TargetBBInMI = 1; 6967 IsNegativeBranch = true; 6968 break; 6969 case AArch64::TBZW: 6970 case AArch64::TBZX: 6971 TargetBBInMI = 2; 6972 IsTestAndBranch = true; 6973 break; 6974 case AArch64::TBNZW: 6975 case AArch64::TBNZX: 6976 TargetBBInMI = 2; 6977 IsNegativeBranch = true; 6978 IsTestAndBranch = true; 6979 break; 6980 } 6981 // So we increment a zero register and test for bits other 6982 // than bit 0? Conservatively bail out in case the verifier 6983 // missed this case. 6984 if (IsTestAndBranch && MI.getOperand(1).getImm()) 6985 return false; 6986 6987 // Find Definition. 6988 assert(MI.getParent() && "Incomplete machine instruciton\n"); 6989 MachineBasicBlock *MBB = MI.getParent(); 6990 MachineFunction *MF = MBB->getParent(); 6991 MachineRegisterInfo *MRI = &MF->getRegInfo(); 6992 Register VReg = MI.getOperand(0).getReg(); 6993 if (!VReg.isVirtual()) 6994 return false; 6995 6996 MachineInstr *DefMI = MRI->getVRegDef(VReg); 6997 6998 // Look through COPY instructions to find definition. 6999 while (DefMI->isCopy()) { 7000 Register CopyVReg = DefMI->getOperand(1).getReg(); 7001 if (!MRI->hasOneNonDBGUse(CopyVReg)) 7002 return false; 7003 if (!MRI->hasOneDef(CopyVReg)) 7004 return false; 7005 DefMI = MRI->getVRegDef(CopyVReg); 7006 } 7007 7008 switch (DefMI->getOpcode()) { 7009 default: 7010 return false; 7011 // Fold AND into a TBZ/TBNZ if constant operand is power of 2. 7012 case AArch64::ANDWri: 7013 case AArch64::ANDXri: { 7014 if (IsTestAndBranch) 7015 return false; 7016 if (DefMI->getParent() != MBB) 7017 return false; 7018 if (!MRI->hasOneNonDBGUse(VReg)) 7019 return false; 7020 7021 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri); 7022 uint64_t Mask = AArch64_AM::decodeLogicalImmediate( 7023 DefMI->getOperand(2).getImm(), Is32Bit ? 
32 : 64); 7024 if (!isPowerOf2_64(Mask)) 7025 return false; 7026 7027 MachineOperand &MO = DefMI->getOperand(1); 7028 Register NewReg = MO.getReg(); 7029 if (!NewReg.isVirtual()) 7030 return false; 7031 7032 assert(!MRI->def_empty(NewReg) && "Register must be defined."); 7033 7034 MachineBasicBlock &RefToMBB = *MBB; 7035 MachineBasicBlock *TBB = MI.getOperand(1).getMBB(); 7036 DebugLoc DL = MI.getDebugLoc(); 7037 unsigned Imm = Log2_64(Mask); 7038 unsigned Opc = (Imm < 32) 7039 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW) 7040 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX); 7041 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc)) 7042 .addReg(NewReg) 7043 .addImm(Imm) 7044 .addMBB(TBB); 7045 // Register lives on to the CBZ now. 7046 MO.setIsKill(false); 7047 7048 // For immediate smaller than 32, we need to use the 32-bit 7049 // variant (W) in all cases. Indeed the 64-bit variant does not 7050 // allow to encode them. 7051 // Therefore, if the input register is 64-bit, we need to take the 7052 // 32-bit sub-part. 7053 if (!Is32Bit && Imm < 32) 7054 NewMI->getOperand(0).setSubReg(AArch64::sub_32); 7055 MI.eraseFromParent(); 7056 return true; 7057 } 7058 // Look for CSINC 7059 case AArch64::CSINCWr: 7060 case AArch64::CSINCXr: { 7061 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR && 7062 DefMI->getOperand(2).getReg() == AArch64::WZR) && 7063 !(DefMI->getOperand(1).getReg() == AArch64::XZR && 7064 DefMI->getOperand(2).getReg() == AArch64::XZR)) 7065 return false; 7066 7067 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) 7068 return false; 7069 7070 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm(); 7071 // Convert only when the condition code is not modified between 7072 // the CSINC and the branch. The CC may be used by other 7073 // instructions in between. 
7074 if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write)) 7075 return false; 7076 MachineBasicBlock &RefToMBB = *MBB; 7077 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB(); 7078 DebugLoc DL = MI.getDebugLoc(); 7079 if (IsNegativeBranch) 7080 CC = AArch64CC::getInvertedCondCode(CC); 7081 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB); 7082 MI.eraseFromParent(); 7083 return true; 7084 } 7085 } 7086 } 7087 7088 std::pair<unsigned, unsigned> 7089 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { 7090 const unsigned Mask = AArch64II::MO_FRAGMENT; 7091 return std::make_pair(TF & Mask, TF & ~Mask); 7092 } 7093 7094 ArrayRef<std::pair<unsigned, const char *>> 7095 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { 7096 using namespace AArch64II; 7097 7098 static const std::pair<unsigned, const char *> TargetFlags[] = { 7099 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"}, 7100 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"}, 7101 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"}, 7102 {MO_HI12, "aarch64-hi12"}}; 7103 return ArrayRef(TargetFlags); 7104 } 7105 7106 ArrayRef<std::pair<unsigned, const char *>> 7107 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { 7108 using namespace AArch64II; 7109 7110 static const std::pair<unsigned, const char *> TargetFlags[] = { 7111 {MO_COFFSTUB, "aarch64-coffstub"}, 7112 {MO_GOT, "aarch64-got"}, 7113 {MO_NC, "aarch64-nc"}, 7114 {MO_S, "aarch64-s"}, 7115 {MO_TLS, "aarch64-tls"}, 7116 {MO_DLLIMPORT, "aarch64-dllimport"}, 7117 {MO_DLLIMPORTAUX, "aarch64-dllimportaux"}, 7118 {MO_PREL, "aarch64-prel"}, 7119 {MO_TAGGED, "aarch64-tagged"}}; 7120 return ArrayRef(TargetFlags); 7121 } 7122 7123 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> 7124 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const { 7125 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] = 7126 {{MOSuppressPair, "aarch64-suppress-pair"}, 7127 {MOStridedAccess, "aarch64-strided-access"}}; 7128 return ArrayRef(TargetFlags); 7129 } 7130 7131 /// Constants defining how certain sequences should be outlined. 7132 /// This encompasses how an outlined function should be called, and what kind of 7133 /// frame should be emitted for that outlined function. 7134 /// 7135 /// \p MachineOutlinerDefault implies that the function should be called with 7136 /// a save and restore of LR to the stack. 7137 /// 7138 /// That is, 7139 /// 7140 /// I1 Save LR OUTLINED_FUNCTION: 7141 /// I2 --> BL OUTLINED_FUNCTION I1 7142 /// I3 Restore LR I2 7143 /// I3 7144 /// RET 7145 /// 7146 /// * Call construction overhead: 3 (save + BL + restore) 7147 /// * Frame construction overhead: 1 (ret) 7148 /// * Requires stack fixups? Yes 7149 /// 7150 /// \p MachineOutlinerTailCall implies that the function is being created from 7151 /// a sequence of instructions ending in a return. 7152 /// 7153 /// That is, 7154 /// 7155 /// I1 OUTLINED_FUNCTION: 7156 /// I2 --> B OUTLINED_FUNCTION I1 7157 /// RET I2 7158 /// RET 7159 /// 7160 /// * Call construction overhead: 1 (B) 7161 /// * Frame construction overhead: 0 (Return included in sequence) 7162 /// * Requires stack fixups? No 7163 /// 7164 /// \p MachineOutlinerNoLRSave implies that the function should be called using 7165 /// a BL instruction, but doesn't require LR to be saved and restored. This 7166 /// happens when LR is known to be dead. 
7167 /// 7168 /// That is, 7169 /// 7170 /// I1 OUTLINED_FUNCTION: 7171 /// I2 --> BL OUTLINED_FUNCTION I1 7172 /// I3 I2 7173 /// I3 7174 /// RET 7175 /// 7176 /// * Call construction overhead: 1 (BL) 7177 /// * Frame construction overhead: 1 (RET) 7178 /// * Requires stack fixups? No 7179 /// 7180 /// \p MachineOutlinerThunk implies that the function is being created from 7181 /// a sequence of instructions ending in a call. The outlined function is 7182 /// called with a BL instruction, and the outlined function tail-calls the 7183 /// original call destination. 7184 /// 7185 /// That is, 7186 /// 7187 /// I1 OUTLINED_FUNCTION: 7188 /// I2 --> BL OUTLINED_FUNCTION I1 7189 /// BL f I2 7190 /// B f 7191 /// * Call construction overhead: 1 (BL) 7192 /// * Frame construction overhead: 0 7193 /// * Requires stack fixups? No 7194 /// 7195 /// \p MachineOutlinerRegSave implies that the function should be called with a 7196 /// save and restore of LR to an available register. This allows us to avoid 7197 /// stack fixups. Note that this outlining variant is compatible with the 7198 /// NoLRSave case. 7199 /// 7200 /// That is, 7201 /// 7202 /// I1 Save LR OUTLINED_FUNCTION: 7203 /// I2 --> BL OUTLINED_FUNCTION I1 7204 /// I3 Restore LR I2 7205 /// I3 7206 /// RET 7207 /// 7208 /// * Call construction overhead: 3 (save + BL + restore) 7209 /// * Frame construction overhead: 1 (ret) 7210 /// * Requires stack fixups? No 7211 enum MachineOutlinerClass { 7212 MachineOutlinerDefault, /// Emit a save, restore, call, and return. 7213 MachineOutlinerTailCall, /// Only emit a branch. 7214 MachineOutlinerNoLRSave, /// Emit a call and return. 7215 MachineOutlinerThunk, /// Emit a call and tail-call. 7216 MachineOutlinerRegSave /// Same as default, but save to a register. 7217 }; 7218 7219 enum MachineOutlinerMBBFlags { 7220 LRUnavailableSomewhere = 0x2, 7221 HasCalls = 0x4, 7222 UnsafeRegsDead = 0x8 7223 }; 7224 7225 Register 7226 AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const { 7227 MachineFunction *MF = C.getMF(); 7228 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo(); 7229 const AArch64RegisterInfo *ARI = 7230 static_cast<const AArch64RegisterInfo *>(&TRI); 7231 // Check if there is an available register across the sequence that we can 7232 // use. 7233 for (unsigned Reg : AArch64::GPR64RegClass) { 7234 if (!ARI->isReservedReg(*MF, Reg) && 7235 Reg != AArch64::LR && // LR is not reserved, but don't use it. 7236 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved. 7237 Reg != AArch64::X17 && // Ditto for X17. 
7238 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) && 7239 C.isAvailableInsideSeq(Reg, TRI)) 7240 return Reg; 7241 } 7242 return Register(); 7243 } 7244 7245 static bool 7246 outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a, 7247 const outliner::Candidate &b) { 7248 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>(); 7249 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>(); 7250 7251 return MFIa->shouldSignReturnAddress(false) == MFIb->shouldSignReturnAddress(false) && 7252 MFIa->shouldSignReturnAddress(true) == MFIb->shouldSignReturnAddress(true); 7253 } 7254 7255 static bool 7256 outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a, 7257 const outliner::Candidate &b) { 7258 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>(); 7259 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>(); 7260 7261 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey(); 7262 } 7263 7264 static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a, 7265 const outliner::Candidate &b) { 7266 const AArch64Subtarget &SubtargetA = 7267 a.getMF()->getSubtarget<AArch64Subtarget>(); 7268 const AArch64Subtarget &SubtargetB = 7269 b.getMF()->getSubtarget<AArch64Subtarget>(); 7270 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps(); 7271 } 7272 7273 std::optional<outliner::OutlinedFunction> 7274 AArch64InstrInfo::getOutliningCandidateInfo( 7275 std::vector<outliner::Candidate> &RepeatedSequenceLocs) const { 7276 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0]; 7277 unsigned SequenceSize = 7278 std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0, 7279 [this](unsigned Sum, const MachineInstr &MI) { 7280 return Sum + getInstSizeInBytes(MI); 7281 }); 7282 unsigned NumBytesToCreateFrame = 0; 7283 7284 // We only allow outlining for functions having exactly matching return 7285 // address signing attributes, i.e., all share the same value for the 7286 // attribute "sign-return-address" and all share the same type of key they 7287 // are signed with. 7288 // Additionally we require all functions to simultaniously either support 7289 // v8.3a features or not. Otherwise an outlined function could get signed 7290 // using dedicated v8.3 instructions and a call from a function that doesn't 7291 // support v8.3 instructions would therefore be invalid. 7292 if (std::adjacent_find( 7293 RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), 7294 [](const outliner::Candidate &a, const outliner::Candidate &b) { 7295 // Return true if a and b are non-equal w.r.t. return address 7296 // signing or support of v8.3a features 7297 if (outliningCandidatesSigningScopeConsensus(a, b) && 7298 outliningCandidatesSigningKeyConsensus(a, b) && 7299 outliningCandidatesV8_3OpsConsensus(a, b)) { 7300 return false; 7301 } 7302 return true; 7303 }) != RepeatedSequenceLocs.end()) { 7304 return std::nullopt; 7305 } 7306 7307 // Since at this point all candidates agree on their return address signing 7308 // picking just one is fine. If the candidate functions potentially sign their 7309 // return addresses, the outlined function should do the same. Note that in 7310 // the case of "sign-return-address"="non-leaf" this is an assumption: It is 7311 // not certainly true that the outlined function will have to sign its return 7312 // address but this decision is made later, when the decision to outline 7313 // has already been made. 
7314 // The same holds for the number of additional instructions we need: On 7315 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is 7316 // necessary. However, at this point we don't know if the outlined function 7317 // will have a RET instruction so we assume the worst. 7318 const TargetRegisterInfo &TRI = getRegisterInfo(); 7319 if (FirstCand.getMF() 7320 ->getInfo<AArch64FunctionInfo>() 7321 ->shouldSignReturnAddress(true)) { 7322 // One PAC and one AUT instructions 7323 NumBytesToCreateFrame += 8; 7324 7325 // We have to check if sp modifying instructions would get outlined. 7326 // If so we only allow outlining if sp is unchanged overall, so matching 7327 // sub and add instructions are okay to outline, all other sp modifications 7328 // are not 7329 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) { 7330 int SPValue = 0; 7331 MachineBasicBlock::iterator MBBI = C.front(); 7332 for (;;) { 7333 if (MBBI->modifiesRegister(AArch64::SP, &TRI)) { 7334 switch (MBBI->getOpcode()) { 7335 case AArch64::ADDXri: 7336 case AArch64::ADDWri: 7337 assert(MBBI->getNumOperands() == 4 && "Wrong number of operands"); 7338 assert(MBBI->getOperand(2).isImm() && 7339 "Expected operand to be immediate"); 7340 assert(MBBI->getOperand(1).isReg() && 7341 "Expected operand to be a register"); 7342 // Check if the add just increments sp. If so, we search for 7343 // matching sub instructions that decrement sp. If not, the 7344 // modification is illegal 7345 if (MBBI->getOperand(1).getReg() == AArch64::SP) 7346 SPValue += MBBI->getOperand(2).getImm(); 7347 else 7348 return true; 7349 break; 7350 case AArch64::SUBXri: 7351 case AArch64::SUBWri: 7352 assert(MBBI->getNumOperands() == 4 && "Wrong number of operands"); 7353 assert(MBBI->getOperand(2).isImm() && 7354 "Expected operand to be immediate"); 7355 assert(MBBI->getOperand(1).isReg() && 7356 "Expected operand to be a register"); 7357 // Check if the sub just decrements sp. If so, we search for 7358 // matching add instructions that increment sp. If not, the 7359 // modification is illegal 7360 if (MBBI->getOperand(1).getReg() == AArch64::SP) 7361 SPValue -= MBBI->getOperand(2).getImm(); 7362 else 7363 return true; 7364 break; 7365 default: 7366 return true; 7367 } 7368 } 7369 if (MBBI == C.back()) 7370 break; 7371 ++MBBI; 7372 } 7373 if (SPValue) 7374 return true; 7375 return false; 7376 }; 7377 // Remove candidates with illegal stack modifying instructions 7378 llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification); 7379 7380 // If the sequence doesn't have enough candidates left, then we're done. 7381 if (RepeatedSequenceLocs.size() < 2) 7382 return std::nullopt; 7383 } 7384 7385 // Properties about candidate MBBs that hold for all of them. 7386 unsigned FlagsSetInAll = 0xF; 7387 7388 // Compute liveness information for each candidate, and set FlagsSetInAll. 7389 for (outliner::Candidate &C : RepeatedSequenceLocs) 7390 FlagsSetInAll &= C.Flags; 7391 7392 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode(); 7393 7394 // Helper lambda which sets call information for every candidate. 
7395 auto SetCandidateCallInfo = 7396 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) { 7397 for (outliner::Candidate &C : RepeatedSequenceLocs) 7398 C.setCallInfo(CallID, NumBytesForCall); 7399 }; 7400 7401 unsigned FrameID = MachineOutlinerDefault; 7402 NumBytesToCreateFrame += 4; 7403 7404 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) { 7405 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement(); 7406 }); 7407 7408 // We check to see if CFI Instructions are present, and if they are 7409 // we find the number of CFI Instructions in the candidates. 7410 unsigned CFICount = 0; 7411 for (auto &I : make_range(RepeatedSequenceLocs[0].front(), 7412 std::next(RepeatedSequenceLocs[0].back()))) { 7413 if (I.isCFIInstruction()) 7414 CFICount++; 7415 } 7416 7417 // We compare the number of found CFI Instructions to the number of CFI 7418 // instructions in the parent function for each candidate. We must check this 7419 // since if we outline one of the CFI instructions in a function, we have to 7420 // outline them all for correctness. If we do not, the address offsets will be 7421 // incorrect between the two sections of the program. 7422 for (outliner::Candidate &C : RepeatedSequenceLocs) { 7423 std::vector<MCCFIInstruction> CFIInstructions = 7424 C.getMF()->getFrameInstructions(); 7425 7426 if (CFICount > 0 && CFICount != CFIInstructions.size()) 7427 return std::nullopt; 7428 } 7429 7430 // Returns true if an instructions is safe to fix up, false otherwise. 7431 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) { 7432 if (MI.isCall()) 7433 return true; 7434 7435 if (!MI.modifiesRegister(AArch64::SP, &TRI) && 7436 !MI.readsRegister(AArch64::SP, &TRI)) 7437 return true; 7438 7439 // Any modification of SP will break our code to save/restore LR. 7440 // FIXME: We could handle some instructions which add a constant 7441 // offset to SP, with a bit more work. 7442 if (MI.modifiesRegister(AArch64::SP, &TRI)) 7443 return false; 7444 7445 // At this point, we have a stack instruction that we might need to 7446 // fix up. We'll handle it if it's a load or store. 7447 if (MI.mayLoadOrStore()) { 7448 const MachineOperand *Base; // Filled with the base operand of MI. 7449 int64_t Offset; // Filled with the offset of MI. 7450 bool OffsetIsScalable; 7451 7452 // Does it allow us to offset the base operand and is the base the 7453 // register SP? 7454 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) || 7455 !Base->isReg() || Base->getReg() != AArch64::SP) 7456 return false; 7457 7458 // Fixe-up code below assumes bytes. 7459 if (OffsetIsScalable) 7460 return false; 7461 7462 // Find the minimum/maximum offset for this instruction and check 7463 // if fixing it up would be in range. 7464 int64_t MinOffset, 7465 MaxOffset; // Unscaled offsets for the instruction. 7466 TypeSize Scale(0U, false); // The scale to multiply the offsets by. 7467 unsigned DummyWidth; 7468 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset); 7469 7470 Offset += 16; // Update the offset to what it would be if we outlined. 7471 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() || 7472 Offset > MaxOffset * (int64_t)Scale.getFixedValue()) 7473 return false; 7474 7475 // It's in range, so we can outline it. 7476 return true; 7477 } 7478 7479 // FIXME: Add handling for instructions like "add x0, sp, #8". 7480 7481 // We can't fix it up, so don't outline it. 
7482 return false; 7483 }; 7484 7485 // True if it's possible to fix up each stack instruction in this sequence. 7486 // Important for frames/call variants that modify the stack. 7487 bool AllStackInstrsSafe = std::all_of( 7488 FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup); 7489 7490 // If the last instruction in any candidate is a terminator, then we should 7491 // tail call all of the candidates. 7492 if (RepeatedSequenceLocs[0].back()->isTerminator()) { 7493 FrameID = MachineOutlinerTailCall; 7494 NumBytesToCreateFrame = 0; 7495 SetCandidateCallInfo(MachineOutlinerTailCall, 4); 7496 } 7497 7498 else if (LastInstrOpcode == AArch64::BL || 7499 ((LastInstrOpcode == AArch64::BLR || 7500 LastInstrOpcode == AArch64::BLRNoIP) && 7501 !HasBTI)) { 7502 // FIXME: Do we need to check if the code after this uses the value of LR? 7503 FrameID = MachineOutlinerThunk; 7504 NumBytesToCreateFrame = 0; 7505 SetCandidateCallInfo(MachineOutlinerThunk, 4); 7506 } 7507 7508 else { 7509 // We need to decide how to emit calls + frames. We can always emit the same 7510 // frame if we don't need to save to the stack. If we have to save to the 7511 // stack, then we need a different frame. 7512 unsigned NumBytesNoStackCalls = 0; 7513 std::vector<outliner::Candidate> CandidatesWithoutStackFixups; 7514 7515 // Check if we have to save LR. 7516 for (outliner::Candidate &C : RepeatedSequenceLocs) { 7517 bool LRAvailable = 7518 (C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere) 7519 ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) 7520 : true; 7521 // If we have a noreturn caller, then we're going to be conservative and 7522 // say that we have to save LR. If we don't have a ret at the end of the 7523 // block, then we can't reason about liveness accurately. 7524 // 7525 // FIXME: We can probably do better than always disabling this in 7526 // noreturn functions by fixing up the liveness info. 7527 bool IsNoReturn = 7528 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn); 7529 7530 // Is LR available? If so, we don't need a save. 7531 if (LRAvailable && !IsNoReturn) { 7532 NumBytesNoStackCalls += 4; 7533 C.setCallInfo(MachineOutlinerNoLRSave, 4); 7534 CandidatesWithoutStackFixups.push_back(C); 7535 } 7536 7537 // Is an unused register available? If so, we won't modify the stack, so 7538 // we can outline with the same frame type as those that don't save LR. 7539 else if (findRegisterToSaveLRTo(C)) { 7540 NumBytesNoStackCalls += 12; 7541 C.setCallInfo(MachineOutlinerRegSave, 12); 7542 CandidatesWithoutStackFixups.push_back(C); 7543 } 7544 7545 // Is SP used in the sequence at all? If not, we don't have to modify 7546 // the stack, so we are guaranteed to get the same frame. 7547 else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) { 7548 NumBytesNoStackCalls += 12; 7549 C.setCallInfo(MachineOutlinerDefault, 12); 7550 CandidatesWithoutStackFixups.push_back(C); 7551 } 7552 7553 // If we outline this, we need to modify the stack. Pretend we don't 7554 // outline this by saving all of its bytes. 7555 else { 7556 NumBytesNoStackCalls += SequenceSize; 7557 } 7558 } 7559 7560 // If there are no places where we have to save LR, then note that we 7561 // don't have to update the stack. Otherwise, give every candidate the 7562 // default call type, as long as it's safe to do so. 
7563 if (!AllStackInstrsSafe || 7564 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) { 7565 RepeatedSequenceLocs = CandidatesWithoutStackFixups; 7566 FrameID = MachineOutlinerNoLRSave; 7567 } else { 7568 SetCandidateCallInfo(MachineOutlinerDefault, 12); 7569 7570 // Bugzilla ID: 46767 7571 // TODO: Check if fixing up the stack more than once is safe so we can 7572 // outline these. 7573 // 7574 // An outline resulting in a caller that requires stack fixups at the 7575 // callsite to a callee that also requires stack fixups can happen when 7576 // there are no available registers at the candidate callsite for a 7577 // candidate that itself also has calls. 7578 // 7579 // In other words if function_containing_sequence in the following pseudo 7580 // assembly requires that we save LR at the point of the call, but there 7581 // are no available registers: in this case we save using SP and as a 7582 // result the SP offsets requires stack fixups by multiples of 16. 7583 // 7584 // function_containing_sequence: 7585 // ... 7586 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N 7587 // call OUTLINED_FUNCTION_N 7588 // restore LR from SP 7589 // ... 7590 // 7591 // OUTLINED_FUNCTION_N: 7592 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N 7593 // ... 7594 // bl foo 7595 // restore LR from SP 7596 // ret 7597 // 7598 // Because the code to handle more than one stack fixup does not 7599 // currently have the proper checks for legality, these cases will assert 7600 // in the AArch64 MachineOutliner. This is because the code to do this 7601 // needs more hardening, testing, better checks that generated code is 7602 // legal, etc and because it is only verified to handle a single pass of 7603 // stack fixup. 7604 // 7605 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch 7606 // these cases until they are known to be handled. Bugzilla 46767 is 7607 // referenced in comments at the assert site. 7608 // 7609 // To avoid asserting (or generating non-legal code on noassert builds) 7610 // we remove all candidates which would need more than one stack fixup by 7611 // pruning the cases where the candidate has calls while also having no 7612 // available LR and having no available general purpose registers to copy 7613 // LR to (ie one extra stack save/restore). 7614 // 7615 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { 7616 erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) { 7617 return (std::any_of( 7618 C.front(), std::next(C.back()), 7619 [](const MachineInstr &MI) { return MI.isCall(); })) && 7620 (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) || 7621 !findRegisterToSaveLRTo(C)); 7622 }); 7623 } 7624 } 7625 7626 // If we dropped all of the candidates, bail out here. 7627 if (RepeatedSequenceLocs.size() < 2) { 7628 RepeatedSequenceLocs.clear(); 7629 return std::nullopt; 7630 } 7631 } 7632 7633 // Does every candidate's MBB contain a call? If so, then we might have a call 7634 // in the range. 7635 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { 7636 // Check if the range contains a call. These require a save + restore of the 7637 // link register. 7638 bool ModStackToSaveLR = false; 7639 if (std::any_of(FirstCand.front(), FirstCand.back(), 7640 [](const MachineInstr &MI) { return MI.isCall(); })) 7641 ModStackToSaveLR = true; 7642 7643 // Handle the last instruction separately. If this is a tail call, then the 7644 // last instruction is a call. We don't want to save + restore in this case. 
7645 // However, it could be possible that the last instruction is a call without 7646 // it being valid to tail call this sequence. We should consider this as 7647 // well. 7648 else if (FrameID != MachineOutlinerThunk && 7649 FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall()) 7650 ModStackToSaveLR = true; 7651 7652 if (ModStackToSaveLR) { 7653 // We can't fix up the stack. Bail out. 7654 if (!AllStackInstrsSafe) { 7655 RepeatedSequenceLocs.clear(); 7656 return std::nullopt; 7657 } 7658 7659 // Save + restore LR. 7660 NumBytesToCreateFrame += 8; 7661 } 7662 } 7663 7664 // If we have CFI instructions, we can only outline if the outlined section 7665 // can be a tail call 7666 if (FrameID != MachineOutlinerTailCall && CFICount > 0) 7667 return std::nullopt; 7668 7669 return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, 7670 NumBytesToCreateFrame, FrameID); 7671 } 7672 7673 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom( 7674 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const { 7675 const Function &F = MF.getFunction(); 7676 7677 // Can F be deduplicated by the linker? If it can, don't outline from it. 7678 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage()) 7679 return false; 7680 7681 // Don't outline from functions with section markings; the program could 7682 // expect that all the code is in the named section. 7683 // FIXME: Allow outlining from multiple functions with the same section 7684 // marking. 7685 if (F.hasSection()) 7686 return false; 7687 7688 // Outlining from functions with redzones is unsafe since the outliner may 7689 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't 7690 // outline from it. 7691 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); 7692 if (!AFI || AFI->hasRedZone().value_or(true)) 7693 return false; 7694 7695 // FIXME: Teach the outliner to generate/handle Windows unwind info. 7696 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) 7697 return false; 7698 7699 // It's safe to outline from MF. 7700 return true; 7701 } 7702 7703 SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>> 7704 AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB, 7705 unsigned &Flags) const { 7706 assert(MBB.getParent()->getRegInfo().tracksLiveness() && 7707 "Must track liveness!"); 7708 SmallVector< 7709 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>> 7710 Ranges; 7711 // According to the AArch64 Procedure Call Standard, the following are 7712 // undefined on entry/exit from a function call: 7713 // 7714 // * Registers x16, x17, (and thus w16, w17) 7715 // * Condition codes (and thus the NZCV register) 7716 // 7717 // If any of these registers are used inside or live across an outlined 7718 // function, then they may be modified later, either by the compiler or 7719 // some other tool (like the linker). 7720 // 7721 // To avoid outlining in these situations, partition each block into ranges 7722 // where these registers are dead. We will only outline from those ranges. 7723 LiveRegUnits LRU(getRegisterInfo()); 7724 auto AreAllUnsafeRegsDead = [&LRU]() { 7725 return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) && 7726 LRU.available(AArch64::NZCV); 7727 }; 7728 7729 // We need to know if LR is live across an outlining boundary later on in 7730 // order to decide how we'll create the outlined call, frame, etc. 7731 // 7732 // It's pretty expensive to check this for *every candidate* within a block. 
7733 // That's some potentially n^2 behaviour, since in the worst case, we'd need 7734 // to compute liveness from the end of the block for O(n) candidates within 7735 // the block. 7736 // 7737 // So, to improve the average case, let's keep track of liveness from the end 7738 // of the block to the beginning of *every outlinable range*. If we know that 7739 // LR is available in every range we could outline from, then we know that 7740 // we don't need to check liveness for any candidate within that range. 7741 bool LRAvailableEverywhere = true; 7742 // Compute liveness bottom-up. 7743 LRU.addLiveOuts(MBB); 7744 // Update flags that require info about the entire MBB. 7745 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) { 7746 if (MI.isCall() && !MI.isTerminator()) 7747 Flags |= MachineOutlinerMBBFlags::HasCalls; 7748 }; 7749 // Range: [RangeBegin, RangeEnd) 7750 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd; 7751 unsigned RangeLen; 7752 auto CreateNewRangeStartingAt = 7753 [&RangeBegin, &RangeEnd, 7754 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) { 7755 RangeBegin = NewBegin; 7756 RangeEnd = std::next(RangeBegin); 7757 RangeLen = 0; 7758 }; 7759 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() { 7760 // At least one unsafe register is not dead. We do not want to outline at 7761 // this point. If it is long enough to outline from, save the range 7762 // [RangeBegin, RangeEnd). 7763 if (RangeLen > 1) 7764 Ranges.push_back(std::make_pair(RangeBegin, RangeEnd)); 7765 }; 7766 // Find the first point where all unsafe registers are dead. 7767 // FIND: <safe instr> <-- end of first potential range 7768 // SKIP: <unsafe def> 7769 // SKIP: ... everything between ... 7770 // SKIP: <unsafe use> 7771 auto FirstPossibleEndPt = MBB.instr_rbegin(); 7772 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) { 7773 LRU.stepBackward(*FirstPossibleEndPt); 7774 // Update flags that impact how we outline across the entire block, 7775 // regardless of safety. 7776 UpdateWholeMBBFlags(*FirstPossibleEndPt); 7777 if (AreAllUnsafeRegsDead()) 7778 break; 7779 } 7780 // If we exhausted the entire block, we have no safe ranges to outline. 7781 if (FirstPossibleEndPt == MBB.instr_rend()) 7782 return Ranges; 7783 // Current range. 7784 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator()); 7785 // StartPt points to the first place where all unsafe registers 7786 // are dead (if there is any such point). Begin partitioning the MBB into 7787 // ranges. 7788 for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) { 7789 LRU.stepBackward(MI); 7790 UpdateWholeMBBFlags(MI); 7791 if (!AreAllUnsafeRegsDead()) { 7792 SaveRangeIfNonEmpty(); 7793 CreateNewRangeStartingAt(MI.getIterator()); 7794 continue; 7795 } 7796 LRAvailableEverywhere &= LRU.available(AArch64::LR); 7797 RangeBegin = MI.getIterator(); 7798 ++RangeLen; 7799 } 7800 // Above loop misses the last (or only) range. If we are still safe, then 7801 // let's save the range. 7802 if (AreAllUnsafeRegsDead()) 7803 SaveRangeIfNonEmpty(); 7804 if (Ranges.empty()) 7805 return Ranges; 7806 // We found the ranges bottom-up. Mapping expects the top-down. Reverse 7807 // the order. 7808 std::reverse(Ranges.begin(), Ranges.end()); 7809 // If there is at least one outlinable range where LR is unavailable 7810 // somewhere, remember that. 
7811 if (!LRAvailableEverywhere) 7812 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere; 7813 return Ranges; 7814 } 7815 7816 outliner::InstrType 7817 AArch64InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT, 7818 unsigned Flags) const { 7819 MachineInstr &MI = *MIT; 7820 MachineBasicBlock *MBB = MI.getParent(); 7821 MachineFunction *MF = MBB->getParent(); 7822 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); 7823 7824 // Don't outline anything used for return address signing. The outlined 7825 // function will get signed later if needed 7826 switch (MI.getOpcode()) { 7827 case AArch64::PACIASP: 7828 case AArch64::PACIBSP: 7829 case AArch64::AUTIASP: 7830 case AArch64::AUTIBSP: 7831 case AArch64::RETAA: 7832 case AArch64::RETAB: 7833 case AArch64::EMITBKEY: 7834 return outliner::InstrType::Illegal; 7835 } 7836 7837 // Don't outline LOHs. 7838 if (FuncInfo->getLOHRelated().count(&MI)) 7839 return outliner::InstrType::Illegal; 7840 7841 // We can only outline these if we will tail call the outlined function, or 7842 // fix up the CFI offsets. Currently, CFI instructions are outlined only if 7843 // in a tail call. 7844 // 7845 // FIXME: If the proper fixups for the offset are implemented, this should be 7846 // possible. 7847 if (MI.isCFIInstruction()) 7848 return outliner::InstrType::Legal; 7849 7850 // Is this a terminator for a basic block? 7851 if (MI.isTerminator()) 7852 // TargetInstrInfo::getOutliningType has already filtered out anything 7853 // that would break this, so we can allow it here. 7854 return outliner::InstrType::Legal; 7855 7856 // Make sure none of the operands are un-outlinable. 7857 for (const MachineOperand &MOP : MI.operands()) { 7858 // A check preventing CFI indices was here before, but only CFI 7859 // instructions should have those. 7860 assert(!MOP.isCFIIndex()); 7861 7862 // If it uses LR or W30 explicitly, then don't touch it. 7863 if (MOP.isReg() && !MOP.isImplicit() && 7864 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30)) 7865 return outliner::InstrType::Illegal; 7866 } 7867 7868 // Special cases for instructions that can always be outlined, but will fail 7869 // the later tests. e.g, ADRPs, which are PC-relative use LR, but can always 7870 // be outlined because they don't require a *specific* value to be in LR. 7871 if (MI.getOpcode() == AArch64::ADRP) 7872 return outliner::InstrType::Legal; 7873 7874 // If MI is a call we might be able to outline it. We don't want to outline 7875 // any calls that rely on the position of items on the stack. When we outline 7876 // something containing a call, we have to emit a save and restore of LR in 7877 // the outlined function. Currently, this always happens by saving LR to the 7878 // stack. Thus, if we outline, say, half the parameters for a function call 7879 // plus the call, then we'll break the callee's expectations for the layout 7880 // of the stack. 7881 // 7882 // FIXME: Allow calls to functions which construct a stack frame, as long 7883 // as they don't access arguments on the stack. 7884 // FIXME: Figure out some way to analyze functions defined in other modules. 7885 // We should be able to compute the memory usage based on the IR calling 7886 // convention, even if we can't see the definition. 7887 if (MI.isCall()) { 7888 // Get the function associated with the call. Look at each operand and find 7889 // the one that represents the callee and get its name. 
7890 const Function *Callee = nullptr; 7891 for (const MachineOperand &MOP : MI.operands()) { 7892 if (MOP.isGlobal()) { 7893 Callee = dyn_cast<Function>(MOP.getGlobal()); 7894 break; 7895 } 7896 } 7897 7898 // Never outline calls to mcount. There isn't any rule that would require 7899 // this, but the Linux kernel's "ftrace" feature depends on it. 7900 if (Callee && Callee->getName() == "\01_mcount") 7901 return outliner::InstrType::Illegal; 7902 7903 // If we don't know anything about the callee, assume it depends on the 7904 // stack layout of the caller. In that case, it's only legal to outline 7905 // as a tail-call. Explicitly list the call instructions we know about so we 7906 // don't get unexpected results with call pseudo-instructions. 7907 auto UnknownCallOutlineType = outliner::InstrType::Illegal; 7908 if (MI.getOpcode() == AArch64::BLR || 7909 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL) 7910 UnknownCallOutlineType = outliner::InstrType::LegalTerminator; 7911 7912 if (!Callee) 7913 return UnknownCallOutlineType; 7914 7915 // We have a function we have information about. Check it if it's something 7916 // can safely outline. 7917 MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee); 7918 7919 // We don't know what's going on with the callee at all. Don't touch it. 7920 if (!CalleeMF) 7921 return UnknownCallOutlineType; 7922 7923 // Check if we know anything about the callee saves on the function. If we 7924 // don't, then don't touch it, since that implies that we haven't 7925 // computed anything about its stack frame yet. 7926 MachineFrameInfo &MFI = CalleeMF->getFrameInfo(); 7927 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 || 7928 MFI.getNumObjects() > 0) 7929 return UnknownCallOutlineType; 7930 7931 // At this point, we can say that CalleeMF ought to not pass anything on the 7932 // stack. Therefore, we can outline it. 7933 return outliner::InstrType::Legal; 7934 } 7935 7936 // Don't touch the link register or W30. 7937 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) || 7938 MI.modifiesRegister(AArch64::W30, &getRegisterInfo())) 7939 return outliner::InstrType::Illegal; 7940 7941 // Don't outline BTI instructions, because that will prevent the outlining 7942 // site from being indirectly callable. 7943 if (MI.getOpcode() == AArch64::HINT) { 7944 int64_t Imm = MI.getOperand(0).getImm(); 7945 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38) 7946 return outliner::InstrType::Illegal; 7947 } 7948 7949 return outliner::InstrType::Legal; 7950 } 7951 7952 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const { 7953 for (MachineInstr &MI : MBB) { 7954 const MachineOperand *Base; 7955 unsigned Width; 7956 int64_t Offset; 7957 bool OffsetIsScalable; 7958 7959 // Is this a load or store with an immediate offset with SP as the base? 7960 if (!MI.mayLoadOrStore() || 7961 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width, 7962 &RI) || 7963 (Base->isReg() && Base->getReg() != AArch64::SP)) 7964 continue; 7965 7966 // It is, so we have to fix it up. 
    TypeSize Scale(0U, false);
    int64_t Dummy1, Dummy2;

    MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
    assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
    getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
    assert(Scale != 0 && "Unexpected opcode!");
    assert(!OffsetIsScalable && "Expected offset to be a byte offset");

    // We've pushed the return address to the stack, so add 16 to the offset.
    // This is safe, since we already checked if it would overflow when we
    // checked if this instruction was legal to outline.
    int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
    StackOffsetOperand.setImm(NewImm);
  }
}

static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
                                 bool ShouldSignReturnAddr,
                                 bool ShouldSignReturnAddrWithBKey) {
  if (ShouldSignReturnAddr) {
    MachineBasicBlock::iterator MBBPAC = MBB.begin();
    MachineBasicBlock::iterator MBBAUT = MBB.getFirstTerminator();
    const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
    const TargetInstrInfo *TII = Subtarget.getInstrInfo();
    DebugLoc DL;

    if (MBBAUT != MBB.end())
      DL = MBBAUT->getDebugLoc();

    // At the very beginning of the basic block we insert the following
    // depending on the key type
    //
    // a_key:                   b_key:
    //   PACIASP                   EMITBKEY
    //   CFI_INSTRUCTION           PACIBSP
    //                             CFI_INSTRUCTION
    if (ShouldSignReturnAddrWithBKey) {
      BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::EMITBKEY))
          .setMIFlag(MachineInstr::FrameSetup);
    }

    BuildMI(MBB, MBBPAC, DebugLoc(),
            TII->get(ShouldSignReturnAddrWithBKey ? AArch64::PACIBSP
                                                  : AArch64::PACIASP))
        .setMIFlag(MachineInstr::FrameSetup);

    if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
      unsigned CFIIndex =
          MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
      BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION))
          .addCFIIndex(CFIIndex)
          .setMIFlags(MachineInstr::FrameSetup);
    }

    // If v8.3a features are available we can replace a RET instruction by
    // RETAA or RETAB and omit the AUT instructions. In this case the
    // DW_CFA_AARCH64_negate_ra_state can't be emitted.
    if (Subtarget.hasPAuth() && MBBAUT != MBB.end() &&
        MBBAUT->getOpcode() == AArch64::RET) {
      BuildMI(MBB, MBBAUT, DL,
              TII->get(ShouldSignReturnAddrWithBKey ? AArch64::RETAB
                                                    : AArch64::RETAA))
          .copyImplicitOps(*MBBAUT);
      MBB.erase(MBBAUT);
    } else {
      BuildMI(MBB, MBBAUT, DL,
              TII->get(ShouldSignReturnAddrWithBKey ? AArch64::AUTIBSP
                                                    : AArch64::AUTIASP))
          .setMIFlag(MachineInstr::FrameDestroy);
      unsigned CFIIndexAuth =
          MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
      BuildMI(MBB, MBBAUT, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
          .addCFIIndex(CFIIndexAuth)
          .setMIFlags(MachineInstr::FrameDestroy);
    }
  }
}

void AArch64InstrInfo::buildOutlinedFrame(
    MachineBasicBlock &MBB, MachineFunction &MF,
    const outliner::OutlinedFunction &OF) const {

  AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();

  if (OF.FrameConstructionID == MachineOutlinerTailCall)
    FI->setOutliningStyle("Tail Call");
  else if (OF.FrameConstructionID == MachineOutlinerThunk) {
    // For thunk outlining, rewrite the last instruction from a call to a
    // tail-call.
    MachineInstr *Call = &*--MBB.instr_end();
    unsigned TailOpcode;
    if (Call->getOpcode() == AArch64::BL) {
      TailOpcode = AArch64::TCRETURNdi;
    } else {
      assert(Call->getOpcode() == AArch64::BLR ||
             Call->getOpcode() == AArch64::BLRNoIP);
      TailOpcode = AArch64::TCRETURNriALL;
    }
    MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
                           .add(Call->getOperand(0))
                           .addImm(0);
    MBB.insert(MBB.end(), TC);
    Call->eraseFromParent();

    FI->setOutliningStyle("Thunk");
  }

  bool IsLeafFunction = true;

  // Is there a call in the outlined range?
  auto IsNonTailCall = [](const MachineInstr &MI) {
    return MI.isCall() && !MI.isReturn();
  };

  if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
    // Fix up the instructions in the range, since we're going to modify the
    // stack.

    // Bugzilla ID: 46767
    // TODO: Check if fixing up twice is safe so we can outline these.
    assert(OF.FrameConstructionID != MachineOutlinerDefault &&
           "Can only fix up stack references once");
    fixupPostOutline(MBB);

    IsLeafFunction = false;

    // LR has to be a live in so that we can save it.
    if (!MBB.isLiveIn(AArch64::LR))
      MBB.addLiveIn(AArch64::LR);

    MachineBasicBlock::iterator It = MBB.begin();
    MachineBasicBlock::iterator Et = MBB.end();

    if (OF.FrameConstructionID == MachineOutlinerTailCall ||
        OF.FrameConstructionID == MachineOutlinerThunk)
      Et = std::prev(MBB.end());

    // Insert a save before the outlined region
    MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
                                .addReg(AArch64::SP, RegState::Define)
                                .addReg(AArch64::LR)
                                .addReg(AArch64::SP)
                                .addImm(-16);
    It = MBB.insert(It, STRXpre);

    if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
      const TargetSubtargetInfo &STI = MF.getSubtarget();
      const MCRegisterInfo *MRI = STI.getRegisterInfo();
      unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);

      // Add a CFI saying the stack was moved 16 B down.
      int64_t StackPosEntry =
          MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));
      BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
          .addCFIIndex(StackPosEntry)
          .setMIFlags(MachineInstr::FrameSetup);

      // Add a CFI saying that the LR that we want to find is now 16 B higher
      // than before.
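      // Together with the CFA adjustment above, this corresponds roughly to
      // the unwind directives (illustrative only):
      //   .cfi_def_cfa_offset 16
      //   .cfi_offset w30, -16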
      int64_t LRPosEntry = MF.addFrameInst(
          MCCFIInstruction::createOffset(nullptr, DwarfReg, -16));
      BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
          .addCFIIndex(LRPosEntry)
          .setMIFlags(MachineInstr::FrameSetup);
    }

    // Insert a restore before the terminator for the function.
    MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
                                 .addReg(AArch64::SP, RegState::Define)
                                 .addReg(AArch64::LR, RegState::Define)
                                 .addReg(AArch64::SP)
                                 .addImm(16);
    Et = MBB.insert(Et, LDRXpost);
  }

  // If a bunch of candidates reach this point they must agree on their return
  // address signing. It is therefore enough to just consider the signing
  // behaviour of one of them.
  const auto &MFI =
      *OF.Candidates.front().getMF()->getInfo<AArch64FunctionInfo>();
  bool ShouldSignReturnAddr = MFI.shouldSignReturnAddress(!IsLeafFunction);

  // a_key is the default.
  bool ShouldSignReturnAddrWithBKey = MFI.shouldSignWithBKey();

  // If this is a tail call outlined function, then there's already a return.
  if (OF.FrameConstructionID == MachineOutlinerTailCall ||
      OF.FrameConstructionID == MachineOutlinerThunk) {
    signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
                         ShouldSignReturnAddrWithBKey);
    return;
  }

  // It's not a tail call, so we have to insert the return ourselves.

  // LR has to be a live in so that we can return to it.
  if (!MBB.isLiveIn(AArch64::LR))
    MBB.addLiveIn(AArch64::LR);

  MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
                          .addReg(AArch64::LR);
  MBB.insert(MBB.end(), ret);

  signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
                       ShouldSignReturnAddrWithBKey);

  FI->setOutliningStyle("Function");

  // Did we have to modify the stack by saving the link register?
  if (OF.FrameConstructionID != MachineOutlinerDefault)
    return;

  // We modified the stack.
  // Walk over the basic block and fix up all the stack accesses.
  fixupPostOutline(MBB);
}

MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
    Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
    MachineFunction &MF, outliner::Candidate &C) const {

  // Are we tail calling?
  if (C.CallConstructionID == MachineOutlinerTailCall) {
    // If yes, then we can just branch to the label.
    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
                            .addGlobalAddress(M.getNamedValue(MF.getName()))
                            .addImm(0));
    return It;
  }

  // Are we saving the link register?
  if (C.CallConstructionID == MachineOutlinerNoLRSave ||
      C.CallConstructionID == MachineOutlinerThunk) {
    // No, so just insert the call.
    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
                            .addGlobalAddress(M.getNamedValue(MF.getName())));
    return It;
  }

  // We want to return the spot where we inserted the call.
  MachineBasicBlock::iterator CallPt;

  // Instructions for saving and restoring LR around the call instruction we're
  // going to insert.
  MachineInstr *Save;
  MachineInstr *Restore;
  // Can we save to a register?
  if (C.CallConstructionID == MachineOutlinerRegSave) {
    // FIXME: This logic should be sunk into a target-specific interface so that
    // we don't have to recompute the register.
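    // For illustration only (x20 is a hypothetical choice of Reg): the
    // ORRXrs-based save/restore built below corresponds to
    //   mov x20, x30
    //   bl  OUTLINED_FUNCTION_N
    //   mov x30, x20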
    Register Reg = findRegisterToSaveLRTo(C);
    assert(Reg && "No callee-saved register available?");

    // LR has to be a live in so that we can save it.
    if (!MBB.isLiveIn(AArch64::LR))
      MBB.addLiveIn(AArch64::LR);

    // Save and restore LR from Reg.
    Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
               .addReg(AArch64::XZR)
               .addReg(AArch64::LR)
               .addImm(0);
    Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
                  .addReg(AArch64::XZR)
                  .addReg(Reg)
                  .addImm(0);
  } else {
    // We have the default case. Save and restore from SP.
    Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
               .addReg(AArch64::SP, RegState::Define)
               .addReg(AArch64::LR)
               .addReg(AArch64::SP)
               .addImm(-16);
    Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
                  .addReg(AArch64::SP, RegState::Define)
                  .addReg(AArch64::LR, RegState::Define)
                  .addReg(AArch64::SP)
                  .addImm(16);
  }

  It = MBB.insert(It, Save);
  It++;

  // Insert the call.
  It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
                          .addGlobalAddress(M.getNamedValue(MF.getName())));
  CallPt = It;
  It++;

  It = MBB.insert(It, Restore);
  return CallPt;
}

bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
    MachineFunction &MF) const {
  return MF.getFunction().hasMinSize();
}

std::optional<DestSourcePair>
AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {

  // AArch64::ORRWrs and AArch64::ORRXrs with a WZR/XZR source register and a
  // zero shift immediate are used as aliases for the mov instruction.
  if (MI.getOpcode() == AArch64::ORRWrs &&
      MI.getOperand(1).getReg() == AArch64::WZR &&
      MI.getOperand(3).getImm() == 0x0) {
    return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
  }

  if (MI.getOpcode() == AArch64::ORRXrs &&
      MI.getOperand(1).getReg() == AArch64::XZR &&
      MI.getOperand(3).getImm() == 0x0) {
    return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
  }

  return std::nullopt;
}

std::optional<RegImmPair>
AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
  int Sign = 1;
  int64_t Offset = 0;

  // TODO: Handle cases where Reg is a super- or sub-register of the
  // destination register.
  const MachineOperand &Op0 = MI.getOperand(0);
  if (!Op0.isReg() || Reg != Op0.getReg())
    return std::nullopt;

  switch (MI.getOpcode()) {
  default:
    return std::nullopt;
  case AArch64::SUBWri:
  case AArch64::SUBXri:
  case AArch64::SUBSWri:
  case AArch64::SUBSXri:
    Sign *= -1;
    [[fallthrough]];
  case AArch64::ADDSWri:
  case AArch64::ADDSXri:
  case AArch64::ADDWri:
  case AArch64::ADDXri: {
    // TODO: The third operand can be a global address (usually some string).
    if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
        !MI.getOperand(2).isImm())
      return std::nullopt;
    int Shift = MI.getOperand(3).getImm();
    assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
    Offset = Sign * (MI.getOperand(2).getImm() << Shift);
  }
  }
  return RegImmPair{MI.getOperand(1).getReg(), Offset};
}

/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
/// the destination register then, if possible, describe the value in terms of
/// the source register.
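///
/// For example (illustrative, not taken from the original comments): for the
/// copy `$w0 = ORRWrs $wzr, $w1, 0`, describing $w0 yields $w1, and describing
/// the super-register $x0 also yields $w1, since ORRWrs zero-extends into the
/// upper 32 bits.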
static std::optional<ParamLoadedValue>
describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
                       const TargetInstrInfo *TII,
                       const TargetRegisterInfo *TRI) {
  auto DestSrc = TII->isCopyInstr(MI);
  if (!DestSrc)
    return std::nullopt;

  Register DestReg = DestSrc->Destination->getReg();
  Register SrcReg = DestSrc->Source->getReg();

  auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});

  // If the described register is the destination, just return the source.
  if (DestReg == DescribedReg)
    return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);

  // ORRWrs zero-extends to 64 bits, so we need to consider such cases.
  if (MI.getOpcode() == AArch64::ORRWrs &&
      TRI->isSuperRegister(DestReg, DescribedReg))
    return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);

  // We may need to describe the lower part of an ORRXrs move.
  if (MI.getOpcode() == AArch64::ORRXrs &&
      TRI->isSubRegister(DestReg, DescribedReg)) {
    Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
    return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
  }

  assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
         "Unhandled ORR[XW]rs copy case");

  return std::nullopt;
}

std::optional<ParamLoadedValue>
AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
                                      Register Reg) const {
  const MachineFunction *MF = MI.getMF();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  switch (MI.getOpcode()) {
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: {
    // MOVZWi may be used for producing zero-extended 32-bit immediates in
    // 64-bit parameters, so we need to consider super-registers.
    if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
      return std::nullopt;

    if (!MI.getOperand(1).isImm())
      return std::nullopt;
    int64_t Immediate = MI.getOperand(1).getImm();
    int Shift = MI.getOperand(2).getImm();
    return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
                            nullptr);
  }
  case AArch64::ORRWrs:
  case AArch64::ORRXrs:
    return describeORRLoadedValue(MI, Reg, this, TRI);
  }

  return TargetInstrInfo::describeLoadedValue(MI, Reg);
}

bool AArch64InstrInfo::isExtendLikelyToBeFolded(
    MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
  assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
         ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
         ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);

  // Anyexts are nops.
  if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
    return true;

  Register DefReg = ExtMI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(DefReg))
    return false;

  // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
  // addressing mode.
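  // Hypothetical illustration (register names and MIR syntax are only a
  // sketch): a sequence like
  //   %ext:_(s64) = G_SEXT %idx:_(s32)
  //   %addr:_(p0) = G_PTR_ADD %base, %ext
  //   %val:_(s32) = G_LOAD %addr
  // is commonly selected to a single extending-register load such as
  //   ldr w0, [x0, w1, sxtw]
  // so the extend costs nothing on its own.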
  auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
  return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
}

uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::ElementSizeMask;
}

bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
}

bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
}

unsigned int
AArch64InstrInfo::getTailDuplicateSize(CodeGenOpt::Level OptLevel) const {
  return OptLevel >= CodeGenOpt::Aggressive ? 6 : 2;
}

unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
  if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
    return AArch64::BLRNoIP;
  else
    return AArch64::BLR;
}

#define GET_INSTRINFO_HELPERS
#define GET_INSTRMAP_INFO
#include "AArch64GenInstrInfo.inc"