1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains the AArch64 implementation of the TargetInstrInfo class. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "AArch64InstrInfo.h" 14 #include "AArch64MachineFunctionInfo.h" 15 #include "AArch64Subtarget.h" 16 #include "MCTargetDesc/AArch64AddressingModes.h" 17 #include "Utils/AArch64BaseInfo.h" 18 #include "llvm/ADT/ArrayRef.h" 19 #include "llvm/ADT/STLExtras.h" 20 #include "llvm/ADT/SmallVector.h" 21 #include "llvm/CodeGen/MachineBasicBlock.h" 22 #include "llvm/CodeGen/MachineCombinerPattern.h" 23 #include "llvm/CodeGen/MachineFrameInfo.h" 24 #include "llvm/CodeGen/MachineFunction.h" 25 #include "llvm/CodeGen/MachineInstr.h" 26 #include "llvm/CodeGen/MachineInstrBuilder.h" 27 #include "llvm/CodeGen/MachineMemOperand.h" 28 #include "llvm/CodeGen/MachineModuleInfo.h" 29 #include "llvm/CodeGen/MachineOperand.h" 30 #include "llvm/CodeGen/MachineRegisterInfo.h" 31 #include "llvm/CodeGen/StackMaps.h" 32 #include "llvm/CodeGen/TargetRegisterInfo.h" 33 #include "llvm/CodeGen/TargetSubtargetInfo.h" 34 #include "llvm/IR/DebugInfoMetadata.h" 35 #include "llvm/IR/DebugLoc.h" 36 #include "llvm/IR/GlobalValue.h" 37 #include "llvm/MC/MCAsmInfo.h" 38 #include "llvm/MC/MCInst.h" 39 #include "llvm/MC/MCInstBuilder.h" 40 #include "llvm/MC/MCInstrDesc.h" 41 #include "llvm/Support/Casting.h" 42 #include "llvm/Support/CodeGen.h" 43 #include "llvm/Support/CommandLine.h" 44 #include "llvm/Support/Compiler.h" 45 #include "llvm/Support/ErrorHandling.h" 46 #include "llvm/Support/LEB128.h" 47 #include "llvm/Support/MathExtras.h" 48 #include "llvm/Target/TargetMachine.h" 49 #include "llvm/Target/TargetOptions.h" 50 #include <cassert> 51 #include <cstdint> 52 #include <iterator> 53 #include <utility> 54 55 using namespace llvm; 56 57 #define GET_INSTRINFO_CTOR_DTOR 58 #include "AArch64GenInstrInfo.inc" 59 60 static cl::opt<unsigned> TBZDisplacementBits( 61 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), 62 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); 63 64 static cl::opt<unsigned> CBZDisplacementBits( 65 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), 66 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)")); 67 68 static cl::opt<unsigned> 69 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), 70 cl::desc("Restrict range of Bcc instructions (DEBUG)")); 71 72 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) 73 : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP, 74 AArch64::CATCHRET), 75 RI(STI.getTargetTriple()), Subtarget(STI) {} 76 77 /// GetInstSize - Return the number of bytes of code the specified 78 /// instruction may be. This returns the maximum number of bytes. 
79 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { 80 const MachineBasicBlock &MBB = *MI.getParent(); 81 const MachineFunction *MF = MBB.getParent(); 82 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); 83 84 { 85 auto Op = MI.getOpcode(); 86 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR) 87 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); 88 } 89 90 // Meta-instructions emit no code. 91 if (MI.isMetaInstruction()) 92 return 0; 93 94 // FIXME: We currently only handle pseudoinstructions that don't get expanded 95 // before the assembly printer. 96 unsigned NumBytes = 0; 97 const MCInstrDesc &Desc = MI.getDesc(); 98 99 // Size should be preferably set in 100 // llvm/lib/Target/AArch64/AArch64InstrInfo.td (default case). 101 // Specific cases handle instructions of variable sizes 102 switch (Desc.getOpcode()) { 103 default: 104 if (Desc.getSize()) 105 return Desc.getSize(); 106 107 // Anything not explicitly designated otherwise (i.e. pseudo-instructions 108 // with fixed constant size but not specified in .td file) is a normal 109 // 4-byte insn. 110 NumBytes = 4; 111 break; 112 case TargetOpcode::STACKMAP: 113 // The upper bound for a stackmap intrinsic is the full length of its shadow 114 NumBytes = StackMapOpers(&MI).getNumPatchBytes(); 115 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 116 break; 117 case TargetOpcode::PATCHPOINT: 118 // The size of the patchpoint intrinsic is the number of bytes requested 119 NumBytes = PatchPointOpers(&MI).getNumPatchBytes(); 120 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 121 break; 122 case TargetOpcode::STATEPOINT: 123 NumBytes = StatepointOpers(&MI).getNumPatchBytes(); 124 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 125 // No patch bytes means a normal call inst is emitted 126 if (NumBytes == 0) 127 NumBytes = 4; 128 break; 129 case AArch64::SPACE: 130 NumBytes = MI.getOperand(1).getImm(); 131 break; 132 case TargetOpcode::BUNDLE: 133 NumBytes = getInstBundleLength(MI); 134 break; 135 } 136 137 return NumBytes; 138 } 139 140 unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const { 141 unsigned Size = 0; 142 MachineBasicBlock::const_instr_iterator I = MI.getIterator(); 143 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); 144 while (++I != E && I->isInsideBundle()) { 145 assert(!I->isBundle() && "No nested bundle!"); 146 Size += getInstSizeInBytes(*I); 147 } 148 return Size; 149 } 150 151 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, 152 SmallVectorImpl<MachineOperand> &Cond) { 153 // Block ends with fall-through condbranch. 
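  // The Cond vector encodes the kind of conditional branch: for Bcc, Cond[0]
  // holds the condition code; for the compare-and-branch and test-and-branch
  // forms, Cond[0] is -1, Cond[1] is the opcode, Cond[2] is the register and,
  // for TB(N)Z only, Cond[3] is the bit number being tested.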
  switch (LastInst->getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    Target = LastInst->getOperand(2).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    Cond.push_back(LastInst->getOperand(1));
  }
}

static unsigned getBranchDisplacementBits(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return 64;
  case AArch64::TBNZW:
  case AArch64::TBZW:
  case AArch64::TBNZX:
  case AArch64::TBZX:
    return TBZDisplacementBits;
  case AArch64::CBNZW:
  case AArch64::CBZW:
  case AArch64::CBNZX:
  case AArch64::CBZX:
    return CBZDisplacementBits;
  case AArch64::Bcc:
    return BCCDisplacementBits;
  }
}

bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                             int64_t BrOffset) const {
  unsigned Bits = getBranchDisplacementBits(BranchOp);
  assert(Bits >= 3 && "max branch displacement must be enough to jump "
                      "over conditional branch expansion");
  return isIntN(Bits, BrOffset / 4);
}

MachineBasicBlock *
AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return MI.getOperand(0).getMBB();
  case AArch64::TBZW:
  case AArch64::TBNZW:
  case AArch64::TBZX:
  case AArch64::TBNZX:
    return MI.getOperand(2).getMBB();
  case AArch64::CBZW:
  case AArch64::CBNZW:
  case AArch64::CBZX:
  case AArch64::CBNZX:
  case AArch64::Bcc:
    return MI.getOperand(1).getMBB();
  }
}

// Branch analysis.
bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *&TBB,
                                     MachineBasicBlock *&FBB,
                                     SmallVectorImpl<MachineOperand> &Cond,
                                     bool AllowModify) const {
  // If the block has no terminators, it just falls into the block after it.
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return false;

  // Skip over SpeculationBarrierEndBB terminators
  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
    --I;
  }

  if (!isUnpredicatedTerminator(*I))
    return false;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;

  // If there is only one terminator instruction, process it.
  unsigned LastOpc = LastInst->getOpcode();
  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
    if (isUncondBranchOpcode(LastOpc)) {
      TBB = LastInst->getOperand(0).getMBB();
      return false;
    }
    if (isCondBranchOpcode(LastOpc)) {
      // Block ends with fall-through condbranch.
      parseCondBranch(LastInst, TBB, Cond);
      return false;
    }
    return true; // Can't handle indirect branch.
  }

  // Get the instruction before it if it is a terminator.
  MachineInstr *SecondLastInst = &*I;
  unsigned SecondLastOpc = SecondLastInst->getOpcode();

  // If AllowModify is true and the block ends with two or more unconditional
  // branches, delete all but the first unconditional branch.
  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
    while (isUncondBranchOpcode(SecondLastOpc)) {
      LastInst->eraseFromParent();
      LastInst = SecondLastInst;
      LastOpc = LastInst->getOpcode();
      if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
        // Return now; the only terminator is an unconditional branch.
        TBB = LastInst->getOperand(0).getMBB();
        return false;
      } else {
        SecondLastInst = &*I;
        SecondLastOpc = SecondLastInst->getOpcode();
      }
    }
  }

  // If we're allowed to modify and the block ends in an unconditional branch
  // which could simply fall through, remove the branch. (Note: This case only
  // matters when we can't understand the whole sequence, otherwise it's also
  // handled by BranchFolding.cpp.)
  if (AllowModify && isUncondBranchOpcode(LastOpc) &&
      MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
    LastInst->eraseFromParent();
    LastInst = SecondLastInst;
    LastOpc = LastInst->getOpcode();
    if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
      assert(!isUncondBranchOpcode(LastOpc) &&
             "unreachable unconditional branches removed above");

      if (isCondBranchOpcode(LastOpc)) {
        // Block ends with fall-through condbranch.
        parseCondBranch(LastInst, TBB, Cond);
        return false;
      }
      return true; // Can't handle indirect branch.
    } else {
      SecondLastInst = &*I;
      SecondLastOpc = SecondLastInst->getOpcode();
    }
  }

  // If there are three terminators, we don't know what sort of block this is.
  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
    return true;

  // If the block ends with a B and a Bcc, handle it.
  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    parseCondBranch(SecondLastInst, TBB, Cond);
    FBB = LastInst->getOperand(0).getMBB();
    return false;
  }

  // If the block ends with two unconditional branches, handle it. The second
  // one is not executed, so remove it.
  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    TBB = SecondLastInst->getOperand(0).getMBB();
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return false;
  }

  // ...likewise if it ends with an indirect branch followed by an unconditional
  // branch.
  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return true;
  }

  // Otherwise, can't handle this.
  return true;
}

bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
                                              MachineBranchPredicate &MBP,
                                              bool AllowModify) const {
  // For the moment, handle only a block which ends with a cb(n)zx followed by
  // a fallthrough. Why this? Because it is a common form.
  // TODO: Should we handle b.cc?
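  // For example, 'cbz x0, <bb>' followed by a fall-through is reported as the
  // predicate 'x0 == 0', with <bb> as TrueDest and the fall-through block as
  // FalseDest.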
357 358 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); 359 if (I == MBB.end()) 360 return true; 361 362 // Skip over SpeculationBarrierEndBB terminators 363 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB || 364 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) { 365 --I; 366 } 367 368 if (!isUnpredicatedTerminator(*I)) 369 return true; 370 371 // Get the last instruction in the block. 372 MachineInstr *LastInst = &*I; 373 unsigned LastOpc = LastInst->getOpcode(); 374 if (!isCondBranchOpcode(LastOpc)) 375 return true; 376 377 switch (LastOpc) { 378 default: 379 return true; 380 case AArch64::CBZW: 381 case AArch64::CBZX: 382 case AArch64::CBNZW: 383 case AArch64::CBNZX: 384 break; 385 }; 386 387 MBP.TrueDest = LastInst->getOperand(1).getMBB(); 388 assert(MBP.TrueDest && "expected!"); 389 MBP.FalseDest = MBB.getNextNode(); 390 391 MBP.ConditionDef = nullptr; 392 MBP.SingleUseCondition = false; 393 394 MBP.LHS = LastInst->getOperand(0); 395 MBP.RHS = MachineOperand::CreateImm(0); 396 MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE 397 : MachineBranchPredicate::PRED_EQ; 398 return false; 399 } 400 401 bool AArch64InstrInfo::reverseBranchCondition( 402 SmallVectorImpl<MachineOperand> &Cond) const { 403 if (Cond[0].getImm() != -1) { 404 // Regular Bcc 405 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm(); 406 Cond[0].setImm(AArch64CC::getInvertedCondCode(CC)); 407 } else { 408 // Folded compare-and-branch 409 switch (Cond[1].getImm()) { 410 default: 411 llvm_unreachable("Unknown conditional branch!"); 412 case AArch64::CBZW: 413 Cond[1].setImm(AArch64::CBNZW); 414 break; 415 case AArch64::CBNZW: 416 Cond[1].setImm(AArch64::CBZW); 417 break; 418 case AArch64::CBZX: 419 Cond[1].setImm(AArch64::CBNZX); 420 break; 421 case AArch64::CBNZX: 422 Cond[1].setImm(AArch64::CBZX); 423 break; 424 case AArch64::TBZW: 425 Cond[1].setImm(AArch64::TBNZW); 426 break; 427 case AArch64::TBNZW: 428 Cond[1].setImm(AArch64::TBZW); 429 break; 430 case AArch64::TBZX: 431 Cond[1].setImm(AArch64::TBNZX); 432 break; 433 case AArch64::TBNZX: 434 Cond[1].setImm(AArch64::TBZX); 435 break; 436 } 437 } 438 439 return false; 440 } 441 442 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB, 443 int *BytesRemoved) const { 444 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); 445 if (I == MBB.end()) 446 return 0; 447 448 if (!isUncondBranchOpcode(I->getOpcode()) && 449 !isCondBranchOpcode(I->getOpcode())) 450 return 0; 451 452 // Remove the branch. 453 I->eraseFromParent(); 454 455 I = MBB.end(); 456 457 if (I == MBB.begin()) { 458 if (BytesRemoved) 459 *BytesRemoved = 4; 460 return 1; 461 } 462 --I; 463 if (!isCondBranchOpcode(I->getOpcode())) { 464 if (BytesRemoved) 465 *BytesRemoved = 4; 466 return 1; 467 } 468 469 // Remove the branch. 470 I->eraseFromParent(); 471 if (BytesRemoved) 472 *BytesRemoved = 8; 473 474 return 2; 475 } 476 477 void AArch64InstrInfo::instantiateCondBranch( 478 MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB, 479 ArrayRef<MachineOperand> Cond) const { 480 if (Cond[0].getImm() != -1) { 481 // Regular Bcc 482 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB); 483 } else { 484 // Folded compare-and-branch 485 // Note that we use addOperand instead of addReg to keep the flags. 
486 const MachineInstrBuilder MIB = 487 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]); 488 if (Cond.size() > 3) 489 MIB.addImm(Cond[3].getImm()); 490 MIB.addMBB(TBB); 491 } 492 } 493 494 unsigned AArch64InstrInfo::insertBranch( 495 MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, 496 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const { 497 // Shouldn't be a fall through. 498 assert(TBB && "insertBranch must not be told to insert a fallthrough"); 499 500 if (!FBB) { 501 if (Cond.empty()) // Unconditional branch? 502 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB); 503 else 504 instantiateCondBranch(MBB, DL, TBB, Cond); 505 506 if (BytesAdded) 507 *BytesAdded = 4; 508 509 return 1; 510 } 511 512 // Two-way conditional branch. 513 instantiateCondBranch(MBB, DL, TBB, Cond); 514 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB); 515 516 if (BytesAdded) 517 *BytesAdded = 8; 518 519 return 2; 520 } 521 522 // Find the original register that VReg is copied from. 523 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { 524 while (Register::isVirtualRegister(VReg)) { 525 const MachineInstr *DefMI = MRI.getVRegDef(VReg); 526 if (!DefMI->isFullCopy()) 527 return VReg; 528 VReg = DefMI->getOperand(1).getReg(); 529 } 530 return VReg; 531 } 532 533 // Determine if VReg is defined by an instruction that can be folded into a 534 // csel instruction. If so, return the folded opcode, and the replacement 535 // register. 536 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, 537 unsigned *NewVReg = nullptr) { 538 VReg = removeCopies(MRI, VReg); 539 if (!Register::isVirtualRegister(VReg)) 540 return 0; 541 542 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg)); 543 const MachineInstr *DefMI = MRI.getVRegDef(VReg); 544 unsigned Opc = 0; 545 unsigned SrcOpNum = 0; 546 switch (DefMI->getOpcode()) { 547 case AArch64::ADDSXri: 548 case AArch64::ADDSWri: 549 // if NZCV is used, do not fold. 550 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 551 return 0; 552 // fall-through to ADDXri and ADDWri. 553 [[fallthrough]]; 554 case AArch64::ADDXri: 555 case AArch64::ADDWri: 556 // add x, 1 -> csinc. 557 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 || 558 DefMI->getOperand(3).getImm() != 0) 559 return 0; 560 SrcOpNum = 1; 561 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr; 562 break; 563 564 case AArch64::ORNXrr: 565 case AArch64::ORNWrr: { 566 // not x -> csinv, represented as orn dst, xzr, src. 567 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); 568 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) 569 return 0; 570 SrcOpNum = 2; 571 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr; 572 break; 573 } 574 575 case AArch64::SUBSXrr: 576 case AArch64::SUBSWrr: 577 // if NZCV is used, do not fold. 578 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 579 return 0; 580 // fall-through to SUBXrr and SUBWrr. 581 [[fallthrough]]; 582 case AArch64::SUBXrr: 583 case AArch64::SUBWrr: { 584 // neg x -> csneg, represented as sub dst, xzr, src. 585 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); 586 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) 587 return 0; 588 SrcOpNum = 2; 589 Opc = Is64Bit ? 
AArch64::CSNEGXr : AArch64::CSNEGWr; 590 break; 591 } 592 default: 593 return 0; 594 } 595 assert(Opc && SrcOpNum && "Missing parameters"); 596 597 if (NewVReg) 598 *NewVReg = DefMI->getOperand(SrcOpNum).getReg(); 599 return Opc; 600 } 601 602 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 603 ArrayRef<MachineOperand> Cond, 604 Register DstReg, Register TrueReg, 605 Register FalseReg, int &CondCycles, 606 int &TrueCycles, 607 int &FalseCycles) const { 608 // Check register classes. 609 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 610 const TargetRegisterClass *RC = 611 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); 612 if (!RC) 613 return false; 614 615 // Also need to check the dest regclass, in case we're trying to optimize 616 // something like: 617 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2 618 if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg))) 619 return false; 620 621 // Expanding cbz/tbz requires an extra cycle of latency on the condition. 622 unsigned ExtraCondLat = Cond.size() != 1; 623 624 // GPRs are handled by csel. 625 // FIXME: Fold in x+1, -x, and ~x when applicable. 626 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) || 627 AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 628 // Single-cycle csel, csinc, csinv, and csneg. 629 CondCycles = 1 + ExtraCondLat; 630 TrueCycles = FalseCycles = 1; 631 if (canFoldIntoCSel(MRI, TrueReg)) 632 TrueCycles = 0; 633 else if (canFoldIntoCSel(MRI, FalseReg)) 634 FalseCycles = 0; 635 return true; 636 } 637 638 // Scalar floating point is handled by fcsel. 639 // FIXME: Form fabs, fmin, and fmax when applicable. 640 if (AArch64::FPR64RegClass.hasSubClassEq(RC) || 641 AArch64::FPR32RegClass.hasSubClassEq(RC)) { 642 CondCycles = 5 + ExtraCondLat; 643 TrueCycles = FalseCycles = 2; 644 return true; 645 } 646 647 // Can't do vectors. 648 return false; 649 } 650 651 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, 652 MachineBasicBlock::iterator I, 653 const DebugLoc &DL, Register DstReg, 654 ArrayRef<MachineOperand> Cond, 655 Register TrueReg, Register FalseReg) const { 656 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 657 658 // Parse the condition code, see parseCondBranch() above. 659 AArch64CC::CondCode CC; 660 switch (Cond.size()) { 661 default: 662 llvm_unreachable("Unknown condition opcode in Cond"); 663 case 1: // b.cc 664 CC = AArch64CC::CondCode(Cond[0].getImm()); 665 break; 666 case 3: { // cbz/cbnz 667 // We must insert a compare against 0. 668 bool Is64Bit; 669 switch (Cond[1].getImm()) { 670 default: 671 llvm_unreachable("Unknown branch opcode in Cond"); 672 case AArch64::CBZW: 673 Is64Bit = false; 674 CC = AArch64CC::EQ; 675 break; 676 case AArch64::CBZX: 677 Is64Bit = true; 678 CC = AArch64CC::EQ; 679 break; 680 case AArch64::CBNZW: 681 Is64Bit = false; 682 CC = AArch64CC::NE; 683 break; 684 case AArch64::CBNZX: 685 Is64Bit = true; 686 CC = AArch64CC::NE; 687 break; 688 } 689 Register SrcReg = Cond[2].getReg(); 690 if (Is64Bit) { 691 // cmp reg, #0 is actually subs xzr, reg, #0. 692 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass); 693 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR) 694 .addReg(SrcReg) 695 .addImm(0) 696 .addImm(0); 697 } else { 698 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass); 699 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR) 700 .addReg(SrcReg) 701 .addImm(0) 702 .addImm(0); 703 } 704 break; 705 } 706 case 4: { // tbz/tbnz 707 // We must insert a tst instruction. 
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::TBZW:
    case AArch64::TBZX:
      CC = AArch64CC::EQ;
      break;
    case AArch64::TBNZW:
    case AArch64::TBNZX:
      CC = AArch64CC::NE;
      break;
    }
    // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
      BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
    else
      BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
    break;
  }
  }

  unsigned Opc = 0;
  const TargetRegisterClass *RC = nullptr;
  bool TryFold = false;
  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
    RC = &AArch64::GPR64RegClass;
    Opc = AArch64::CSELXr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
    RC = &AArch64::GPR32RegClass;
    Opc = AArch64::CSELWr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FCSELDrrr;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
    RC = &AArch64::FPR32RegClass;
    Opc = AArch64::FCSELSrrr;
  }
  assert(RC && "Unsupported regclass");

  // Try folding simple instructions into the csel.
  if (TryFold) {
    unsigned NewVReg = 0;
    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
    if (FoldedOpc) {
      // The folded opcodes csinc, csinv and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
      CC = AArch64CC::getInvertedCondCode(CC);
      TrueReg = FalseReg;
    } else
      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);

    // Fold the operation. Leave any dead instructions for DCE to clean up.
    if (FoldedOpc) {
      FalseReg = NewVReg;
      Opc = FoldedOpc;
      // This extends the live range of NewVReg.
      MRI.clearKillFlags(NewVReg);
    }
  }

  // Pull all virtual registers into the appropriate class.
  MRI.constrainRegClass(TrueReg, RC);
  MRI.constrainRegClass(FalseReg, RC);

  // Insert the csel.
  BuildMI(MBB, I, DL, get(Opc), DstReg)
      .addReg(TrueReg)
      .addReg(FalseReg)
      .addImm(CC);
}

/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
  uint64_t Imm = MI.getOperand(1).getImm();
  uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
  uint64_t Encoding;
  return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
}

// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
  if (!Subtarget.hasCustomCheapAsMoveHandling())
    return MI.isAsCheapAsAMove();

  const unsigned Opcode = MI.getOpcode();

  // Firstly, check cases gated by features.
804 805 if (Subtarget.hasZeroCycleZeroingFP()) { 806 if (Opcode == AArch64::FMOVH0 || 807 Opcode == AArch64::FMOVS0 || 808 Opcode == AArch64::FMOVD0) 809 return true; 810 } 811 812 if (Subtarget.hasZeroCycleZeroingGP()) { 813 if (Opcode == TargetOpcode::COPY && 814 (MI.getOperand(1).getReg() == AArch64::WZR || 815 MI.getOperand(1).getReg() == AArch64::XZR)) 816 return true; 817 } 818 819 // Secondly, check cases specific to sub-targets. 820 821 if (Subtarget.hasExynosCheapAsMoveHandling()) { 822 if (isExynosCheapAsMove(MI)) 823 return true; 824 825 return MI.isAsCheapAsAMove(); 826 } 827 828 // Finally, check generic cases. 829 830 switch (Opcode) { 831 default: 832 return false; 833 834 // add/sub on register without shift 835 case AArch64::ADDWri: 836 case AArch64::ADDXri: 837 case AArch64::SUBWri: 838 case AArch64::SUBXri: 839 return (MI.getOperand(3).getImm() == 0); 840 841 // logical ops on immediate 842 case AArch64::ANDWri: 843 case AArch64::ANDXri: 844 case AArch64::EORWri: 845 case AArch64::EORXri: 846 case AArch64::ORRWri: 847 case AArch64::ORRXri: 848 return true; 849 850 // logical ops on register without shift 851 case AArch64::ANDWrr: 852 case AArch64::ANDXrr: 853 case AArch64::BICWrr: 854 case AArch64::BICXrr: 855 case AArch64::EONWrr: 856 case AArch64::EONXrr: 857 case AArch64::EORWrr: 858 case AArch64::EORXrr: 859 case AArch64::ORNWrr: 860 case AArch64::ORNXrr: 861 case AArch64::ORRWrr: 862 case AArch64::ORRXrr: 863 return true; 864 865 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or 866 // ORRXri, it is as cheap as MOV 867 case AArch64::MOVi32imm: 868 return canBeExpandedToORR(MI, 32); 869 case AArch64::MOVi64imm: 870 return canBeExpandedToORR(MI, 64); 871 } 872 873 llvm_unreachable("Unknown opcode to check as cheap as a move!"); 874 } 875 876 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) { 877 switch (MI.getOpcode()) { 878 default: 879 return false; 880 881 case AArch64::ADDWrs: 882 case AArch64::ADDXrs: 883 case AArch64::ADDSWrs: 884 case AArch64::ADDSXrs: { 885 unsigned Imm = MI.getOperand(3).getImm(); 886 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 887 if (ShiftVal == 0) 888 return true; 889 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5; 890 } 891 892 case AArch64::ADDWrx: 893 case AArch64::ADDXrx: 894 case AArch64::ADDXrx64: 895 case AArch64::ADDSWrx: 896 case AArch64::ADDSXrx: 897 case AArch64::ADDSXrx64: { 898 unsigned Imm = MI.getOperand(3).getImm(); 899 switch (AArch64_AM::getArithExtendType(Imm)) { 900 default: 901 return false; 902 case AArch64_AM::UXTB: 903 case AArch64_AM::UXTH: 904 case AArch64_AM::UXTW: 905 case AArch64_AM::UXTX: 906 return AArch64_AM::getArithShiftValue(Imm) <= 4; 907 } 908 } 909 910 case AArch64::SUBWrs: 911 case AArch64::SUBSWrs: { 912 unsigned Imm = MI.getOperand(3).getImm(); 913 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 914 return ShiftVal == 0 || 915 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31); 916 } 917 918 case AArch64::SUBXrs: 919 case AArch64::SUBSXrs: { 920 unsigned Imm = MI.getOperand(3).getImm(); 921 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 922 return ShiftVal == 0 || 923 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63); 924 } 925 926 case AArch64::SUBWrx: 927 case AArch64::SUBXrx: 928 case AArch64::SUBXrx64: 929 case AArch64::SUBSWrx: 930 case AArch64::SUBSXrx: 931 case AArch64::SUBSXrx64: { 932 unsigned Imm = MI.getOperand(3).getImm(); 933 switch (AArch64_AM::getArithExtendType(Imm)) { 934 
default: 935 return false; 936 case AArch64_AM::UXTB: 937 case AArch64_AM::UXTH: 938 case AArch64_AM::UXTW: 939 case AArch64_AM::UXTX: 940 return AArch64_AM::getArithShiftValue(Imm) == 0; 941 } 942 } 943 944 case AArch64::LDRBBroW: 945 case AArch64::LDRBBroX: 946 case AArch64::LDRBroW: 947 case AArch64::LDRBroX: 948 case AArch64::LDRDroW: 949 case AArch64::LDRDroX: 950 case AArch64::LDRHHroW: 951 case AArch64::LDRHHroX: 952 case AArch64::LDRHroW: 953 case AArch64::LDRHroX: 954 case AArch64::LDRQroW: 955 case AArch64::LDRQroX: 956 case AArch64::LDRSBWroW: 957 case AArch64::LDRSBWroX: 958 case AArch64::LDRSBXroW: 959 case AArch64::LDRSBXroX: 960 case AArch64::LDRSHWroW: 961 case AArch64::LDRSHWroX: 962 case AArch64::LDRSHXroW: 963 case AArch64::LDRSHXroX: 964 case AArch64::LDRSWroW: 965 case AArch64::LDRSWroX: 966 case AArch64::LDRSroW: 967 case AArch64::LDRSroX: 968 case AArch64::LDRWroW: 969 case AArch64::LDRWroX: 970 case AArch64::LDRXroW: 971 case AArch64::LDRXroX: 972 case AArch64::PRFMroW: 973 case AArch64::PRFMroX: 974 case AArch64::STRBBroW: 975 case AArch64::STRBBroX: 976 case AArch64::STRBroW: 977 case AArch64::STRBroX: 978 case AArch64::STRDroW: 979 case AArch64::STRDroX: 980 case AArch64::STRHHroW: 981 case AArch64::STRHHroX: 982 case AArch64::STRHroW: 983 case AArch64::STRHroX: 984 case AArch64::STRQroW: 985 case AArch64::STRQroX: 986 case AArch64::STRSroW: 987 case AArch64::STRSroX: 988 case AArch64::STRWroW: 989 case AArch64::STRWroX: 990 case AArch64::STRXroW: 991 case AArch64::STRXroX: { 992 unsigned IsSigned = MI.getOperand(3).getImm(); 993 return !IsSigned; 994 } 995 } 996 } 997 998 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) { 999 unsigned Opc = MI.getOpcode(); 1000 switch (Opc) { 1001 default: 1002 return false; 1003 case AArch64::SEH_StackAlloc: 1004 case AArch64::SEH_SaveFPLR: 1005 case AArch64::SEH_SaveFPLR_X: 1006 case AArch64::SEH_SaveReg: 1007 case AArch64::SEH_SaveReg_X: 1008 case AArch64::SEH_SaveRegP: 1009 case AArch64::SEH_SaveRegP_X: 1010 case AArch64::SEH_SaveFReg: 1011 case AArch64::SEH_SaveFReg_X: 1012 case AArch64::SEH_SaveFRegP: 1013 case AArch64::SEH_SaveFRegP_X: 1014 case AArch64::SEH_SetFP: 1015 case AArch64::SEH_AddFP: 1016 case AArch64::SEH_Nop: 1017 case AArch64::SEH_PrologEnd: 1018 case AArch64::SEH_EpilogStart: 1019 case AArch64::SEH_EpilogEnd: 1020 case AArch64::SEH_PACSignLR: 1021 return true; 1022 } 1023 } 1024 1025 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, 1026 Register &SrcReg, Register &DstReg, 1027 unsigned &SubIdx) const { 1028 switch (MI.getOpcode()) { 1029 default: 1030 return false; 1031 case AArch64::SBFMXri: // aka sxtw 1032 case AArch64::UBFMXri: // aka uxtw 1033 // Check for the 32 -> 64 bit extension case, these instructions can do 1034 // much more. 1035 if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31) 1036 return false; 1037 // This is a signed or unsigned 32 -> 64 bit extension. 
    SrcReg = MI.getOperand(1).getReg();
    DstReg = MI.getOperand(0).getReg();
    SubIdx = AArch64::sub_32;
    return true;
  }
}

bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
    const MachineInstr &MIa, const MachineInstr &MIb) const {
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
  int64_t OffsetA = 0, OffsetB = 0;
  unsigned WidthA = 0, WidthB = 0;
  bool OffsetAIsScalable = false, OffsetBIsScalable = false;

  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");

  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
      MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    return false;

  // Retrieve the base, offset from the base and width. Width
  // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
  // the bases are identical, and the offset of the lower memory access plus
  // its width doesn't overlap the offset of the higher memory access,
  // then the memory accesses are different.
  // If OffsetAIsScalable and OffsetBIsScalable are both true, they
  // are assumed to have the same scale (vscale).
  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
                                   WidthA, TRI) &&
      getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
                                   WidthB, TRI)) {
    if (BaseOpA->isIdenticalTo(*BaseOpB) &&
        OffsetAIsScalable == OffsetBIsScalable) {
      int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
      int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
      int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
      if (LowOffset + LowWidth <= HighOffset)
        return true;
    }
  }
  return false;
}

bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                            const MachineBasicBlock *MBB,
                                            const MachineFunction &MF) const {
  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
    return true;
  switch (MI.getOpcode()) {
  case AArch64::HINT:
    // CSDB hints are scheduling barriers.
    if (MI.getOperand(0).getImm() == 0x14)
      return true;
    break;
  case AArch64::DSB:
  case AArch64::ISB:
    // DSB and ISB also are scheduling barriers.
    return true;
  case AArch64::MSRpstatesvcrImm1:
    // SMSTART and SMSTOP are also scheduling barriers.
    return true;
  default:;
  }
  if (isSEHInstruction(MI))
    return true;
  auto Next = std::next(MI.getIterator());
  return Next != MBB->end() && Next->isCFIInstruction();
}

/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
                                      Register &SrcReg2, int64_t &CmpMask,
                                      int64_t &CmpValue) const {
  // The first operand can be a frame index where we'd normally expect a
  // register.
1117 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands"); 1118 if (!MI.getOperand(1).isReg()) 1119 return false; 1120 1121 switch (MI.getOpcode()) { 1122 default: 1123 break; 1124 case AArch64::PTEST_PP: 1125 case AArch64::PTEST_PP_ANY: 1126 SrcReg = MI.getOperand(0).getReg(); 1127 SrcReg2 = MI.getOperand(1).getReg(); 1128 // Not sure about the mask and value for now... 1129 CmpMask = ~0; 1130 CmpValue = 0; 1131 return true; 1132 case AArch64::SUBSWrr: 1133 case AArch64::SUBSWrs: 1134 case AArch64::SUBSWrx: 1135 case AArch64::SUBSXrr: 1136 case AArch64::SUBSXrs: 1137 case AArch64::SUBSXrx: 1138 case AArch64::ADDSWrr: 1139 case AArch64::ADDSWrs: 1140 case AArch64::ADDSWrx: 1141 case AArch64::ADDSXrr: 1142 case AArch64::ADDSXrs: 1143 case AArch64::ADDSXrx: 1144 // Replace SUBSWrr with SUBWrr if NZCV is not used. 1145 SrcReg = MI.getOperand(1).getReg(); 1146 SrcReg2 = MI.getOperand(2).getReg(); 1147 CmpMask = ~0; 1148 CmpValue = 0; 1149 return true; 1150 case AArch64::SUBSWri: 1151 case AArch64::ADDSWri: 1152 case AArch64::SUBSXri: 1153 case AArch64::ADDSXri: 1154 SrcReg = MI.getOperand(1).getReg(); 1155 SrcReg2 = 0; 1156 CmpMask = ~0; 1157 CmpValue = MI.getOperand(2).getImm(); 1158 return true; 1159 case AArch64::ANDSWri: 1160 case AArch64::ANDSXri: 1161 // ANDS does not use the same encoding scheme as the others xxxS 1162 // instructions. 1163 SrcReg = MI.getOperand(1).getReg(); 1164 SrcReg2 = 0; 1165 CmpMask = ~0; 1166 CmpValue = AArch64_AM::decodeLogicalImmediate( 1167 MI.getOperand(2).getImm(), 1168 MI.getOpcode() == AArch64::ANDSWri ? 32 : 64); 1169 return true; 1170 } 1171 1172 return false; 1173 } 1174 1175 static bool UpdateOperandRegClass(MachineInstr &Instr) { 1176 MachineBasicBlock *MBB = Instr.getParent(); 1177 assert(MBB && "Can't get MachineBasicBlock here"); 1178 MachineFunction *MF = MBB->getParent(); 1179 assert(MF && "Can't get MachineFunction here"); 1180 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); 1181 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); 1182 MachineRegisterInfo *MRI = &MF->getRegInfo(); 1183 1184 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx; 1185 ++OpIdx) { 1186 MachineOperand &MO = Instr.getOperand(OpIdx); 1187 const TargetRegisterClass *OpRegCstraints = 1188 Instr.getRegClassConstraint(OpIdx, TII, TRI); 1189 1190 // If there's no constraint, there's nothing to do. 1191 if (!OpRegCstraints) 1192 continue; 1193 // If the operand is a frame index, there's nothing to do here. 1194 // A frame index operand will resolve correctly during PEI. 1195 if (MO.isFI()) 1196 continue; 1197 1198 assert(MO.isReg() && 1199 "Operand has register constraints without being a register!"); 1200 1201 Register Reg = MO.getReg(); 1202 if (Reg.isPhysical()) { 1203 if (!OpRegCstraints->contains(Reg)) 1204 return false; 1205 } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) && 1206 !MRI->constrainRegClass(Reg, OpRegCstraints)) 1207 return false; 1208 } 1209 1210 return true; 1211 } 1212 1213 /// Return the opcode that does not set flags when possible - otherwise 1214 /// return the original opcode. The caller is responsible to do the actual 1215 /// substitution and legality checking. 1216 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) { 1217 // Don't convert all compare instructions, because for some the zero register 1218 // encoding becomes the sp register. 
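  // For example, 'subs wzr, w1, #2' (i.e. cmp w1, #2) must keep its S form:
  // in the non-flag-setting encoding, register 31 as the destination means
  // sp, so dropping the S would turn it into 'sub wsp, w1, #2'.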
1219 bool MIDefinesZeroReg = false; 1220 if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR)) 1221 MIDefinesZeroReg = true; 1222 1223 switch (MI.getOpcode()) { 1224 default: 1225 return MI.getOpcode(); 1226 case AArch64::ADDSWrr: 1227 return AArch64::ADDWrr; 1228 case AArch64::ADDSWri: 1229 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri; 1230 case AArch64::ADDSWrs: 1231 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs; 1232 case AArch64::ADDSWrx: 1233 return AArch64::ADDWrx; 1234 case AArch64::ADDSXrr: 1235 return AArch64::ADDXrr; 1236 case AArch64::ADDSXri: 1237 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri; 1238 case AArch64::ADDSXrs: 1239 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs; 1240 case AArch64::ADDSXrx: 1241 return AArch64::ADDXrx; 1242 case AArch64::SUBSWrr: 1243 return AArch64::SUBWrr; 1244 case AArch64::SUBSWri: 1245 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri; 1246 case AArch64::SUBSWrs: 1247 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs; 1248 case AArch64::SUBSWrx: 1249 return AArch64::SUBWrx; 1250 case AArch64::SUBSXrr: 1251 return AArch64::SUBXrr; 1252 case AArch64::SUBSXri: 1253 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri; 1254 case AArch64::SUBSXrs: 1255 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs; 1256 case AArch64::SUBSXrx: 1257 return AArch64::SUBXrx; 1258 } 1259 } 1260 1261 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 }; 1262 1263 /// True when condition flags are accessed (either by writing or reading) 1264 /// on the instruction trace starting at From and ending at To. 1265 /// 1266 /// Note: If From and To are from different blocks it's assumed CC are accessed 1267 /// on the path. 1268 static bool areCFlagsAccessedBetweenInstrs( 1269 MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, 1270 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) { 1271 // Early exit if To is at the beginning of the BB. 1272 if (To == To->getParent()->begin()) 1273 return true; 1274 1275 // Check whether the instructions are in the same basic block 1276 // If not, assume the condition flags might get modified somewhere. 1277 if (To->getParent() != From->getParent()) 1278 return true; 1279 1280 // From must be above To. 1281 assert(std::any_of( 1282 ++To.getReverse(), To->getParent()->rend(), 1283 [From](MachineInstr &MI) { return MI.getIterator() == From; })); 1284 1285 // We iterate backward starting at \p To until we hit \p From. 
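  // Debug instructions are skipped by instructionsWithoutDebug so they do not
  // affect the result.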
  for (const MachineInstr &Instr :
       instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
    if (((AccessToCheck & AK_Write) &&
         Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
        ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
      return true;
  }
  return false;
}

/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
/// operation which could set the flags in an identical manner.
bool AArch64InstrInfo::optimizePTestInstr(
    MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
    const MachineRegisterInfo *MRI) const {
  auto *Mask = MRI->getUniqueVRegDef(MaskReg);
  auto *Pred = MRI->getUniqueVRegDef(PredReg);
  auto NewOp = Pred->getOpcode();
  bool OpChanged = false;

  unsigned MaskOpcode = Mask->getOpcode();
  unsigned PredOpcode = Pred->getOpcode();
  bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
  bool PredIsWhileLike = isWhileOpcode(PredOpcode);

  if (isPTrueOpcode(MaskOpcode) && (PredIsPTestLike || PredIsWhileLike) &&
      getElementSizeForOpcode(MaskOpcode) ==
          getElementSizeForOpcode(PredOpcode) &&
      Mask->getOperand(1).getImm() == 31) {
    // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
    // redundant since WHILE performs an implicit PTEST with an all active
    // mask. Must be an all active predicate of matching element size.

    // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
    // PTEST_LIKE instruction uses the same all active mask and the element
    // size matches. If the PTEST has a condition of any then it is always
    // redundant.
    if (PredIsPTestLike) {
      auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
      if (Mask != PTestLikeMask && PTest->getOpcode() != AArch64::PTEST_PP_ANY)
        return false;
    }

    // Fallthrough to simply remove the PTEST.
  } else if ((Mask == Pred) && (PredIsPTestLike || PredIsWhileLike) &&
             PTest->getOpcode() == AArch64::PTEST_PP_ANY) {
    // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
    // instruction that sets the flags as PTEST would. This is only valid when
    // the condition is any.

    // Fallthrough to simply remove the PTEST.
  } else if (PredIsPTestLike) {
    // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
    // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
    // on 8-bit predicates like the PTEST. Otherwise, for instructions like
    // compare that also support 16/32/64-bit predicates, the implicit PTEST
    // performed by the compare could consider fewer lanes for these element
    // sizes.
    //
    // For example, consider
    //
    //   ptrue p0.b                    ; P0=1111-1111-1111-1111
    //   index z0.s, #0, #1            ; Z0=<0,1,2,3>
    //   index z1.s, #1, #1            ; Z1=<1,2,3,4>
    //   cmphi p1.s, p0/z, z1.s, z0.s  ; P1=0001-0001-0001-0001
    //                                 ;       ^ last active
    //   ptest p0, p1.b                ; P1=0001-0001-0001-0001
    //                                 ;     ^ last active
    //
    // where the compare generates a canonical all active 32-bit predicate
    // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
    // active flag, whereas the PTEST instruction with the same mask doesn't.
    // For PTEST_ANY this doesn't apply as the flags in this case would be
    // identical regardless of element size.
    auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
    uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
    if ((Mask != PTestLikeMask) ||
        (PredElementSize != AArch64::ElementSizeB &&
         PTest->getOpcode() != AArch64::PTEST_PP_ANY))
      return false;

    // Fallthrough to simply remove the PTEST.
  } else {
    // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
    // opcode so the PTEST becomes redundant.
    switch (PredOpcode) {
    case AArch64::AND_PPzPP:
    case AArch64::BIC_PPzPP:
    case AArch64::EOR_PPzPP:
    case AArch64::NAND_PPzPP:
    case AArch64::NOR_PPzPP:
    case AArch64::ORN_PPzPP:
    case AArch64::ORR_PPzPP:
    case AArch64::BRKA_PPzP:
    case AArch64::BRKPA_PPzPP:
    case AArch64::BRKB_PPzP:
    case AArch64::BRKPB_PPzPP:
    case AArch64::RDFFR_PPz: {
      // Check to see if our mask is the same. If not the resulting flag bits
      // may be different and we can't remove the ptest.
      auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
      if (Mask != PredMask)
        return false;
      break;
    }
    case AArch64::BRKN_PPzP: {
      // BRKN uses an all active implicit mask to set flags unlike the other
      // flag-setting instructions.
      // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
      if ((MaskOpcode != AArch64::PTRUE_B) ||
          (Mask->getOperand(1).getImm() != 31))
        return false;
      break;
    }
    case AArch64::PTRUE_B:
      // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
      break;
    default:
      // Bail out if we don't recognize the input.
      return false;
    }

    NewOp = convertToFlagSettingOpc(PredOpcode);
    OpChanged = true;
  }

  const TargetRegisterInfo *TRI = &getRegisterInfo();

  // If another instruction between Pred and PTest accesses flags, don't remove
  // the ptest or update the earlier instruction to modify them.
  if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
    return false;

  // If we pass all the checks, it's safe to remove the PTEST and use the flags
  // as they are prior to PTEST. Sometimes this requires the tested PTEST
  // operand to be replaced with an equivalent instruction that also sets the
  // flags.
  Pred->setDesc(get(NewOp));
  PTest->eraseFromParent();
  if (OpChanged) {
    bool succeeded = UpdateOperandRegClass(*Pred);
    (void)succeeded;
    assert(succeeded && "Operands have incompatible register classes!");
    Pred->addRegisterDefined(AArch64::NZCV, TRI);
  }

  // Ensure that the flags def is live.
  if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
    unsigned i = 0, e = Pred->getNumOperands();
    for (; i != e; ++i) {
      MachineOperand &MO = Pred->getOperand(i);
      if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
        MO.setIsDead(false);
        break;
      }
    }
  }
  return true;
}

/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It is a true compare instruction
/// when there are no uses of its destination register.
///
/// The following steps are tried in order:
/// 1. Convert CmpInstr into an unconditional version.
/// 2. Remove CmpInstr if above there is an instruction producing a needed
/// condition code or an instruction which can be converted into such an
/// instruction.
1456 /// Only comparison with zero is supported. 1457 bool AArch64InstrInfo::optimizeCompareInstr( 1458 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, 1459 int64_t CmpValue, const MachineRegisterInfo *MRI) const { 1460 assert(CmpInstr.getParent()); 1461 assert(MRI); 1462 1463 // Replace SUBSWrr with SUBWrr if NZCV is not used. 1464 int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true); 1465 if (DeadNZCVIdx != -1) { 1466 if (CmpInstr.definesRegister(AArch64::WZR) || 1467 CmpInstr.definesRegister(AArch64::XZR)) { 1468 CmpInstr.eraseFromParent(); 1469 return true; 1470 } 1471 unsigned Opc = CmpInstr.getOpcode(); 1472 unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr); 1473 if (NewOpc == Opc) 1474 return false; 1475 const MCInstrDesc &MCID = get(NewOpc); 1476 CmpInstr.setDesc(MCID); 1477 CmpInstr.removeOperand(DeadNZCVIdx); 1478 bool succeeded = UpdateOperandRegClass(CmpInstr); 1479 (void)succeeded; 1480 assert(succeeded && "Some operands reg class are incompatible!"); 1481 return true; 1482 } 1483 1484 if (CmpInstr.getOpcode() == AArch64::PTEST_PP || 1485 CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY) 1486 return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI); 1487 1488 if (SrcReg2 != 0) 1489 return false; 1490 1491 // CmpInstr is a Compare instruction if destination register is not used. 1492 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg())) 1493 return false; 1494 1495 if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI)) 1496 return true; 1497 return (CmpValue == 0 || CmpValue == 1) && 1498 removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI); 1499 } 1500 1501 /// Get opcode of S version of Instr. 1502 /// If Instr is S version its opcode is returned. 1503 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version 1504 /// or we are not interested in it. 1505 static unsigned sForm(MachineInstr &Instr) { 1506 switch (Instr.getOpcode()) { 1507 default: 1508 return AArch64::INSTRUCTION_LIST_END; 1509 1510 case AArch64::ADDSWrr: 1511 case AArch64::ADDSWri: 1512 case AArch64::ADDSXrr: 1513 case AArch64::ADDSXri: 1514 case AArch64::SUBSWrr: 1515 case AArch64::SUBSWri: 1516 case AArch64::SUBSXrr: 1517 case AArch64::SUBSXri: 1518 return Instr.getOpcode(); 1519 1520 case AArch64::ADDWrr: 1521 return AArch64::ADDSWrr; 1522 case AArch64::ADDWri: 1523 return AArch64::ADDSWri; 1524 case AArch64::ADDXrr: 1525 return AArch64::ADDSXrr; 1526 case AArch64::ADDXri: 1527 return AArch64::ADDSXri; 1528 case AArch64::ADCWr: 1529 return AArch64::ADCSWr; 1530 case AArch64::ADCXr: 1531 return AArch64::ADCSXr; 1532 case AArch64::SUBWrr: 1533 return AArch64::SUBSWrr; 1534 case AArch64::SUBWri: 1535 return AArch64::SUBSWri; 1536 case AArch64::SUBXrr: 1537 return AArch64::SUBSXrr; 1538 case AArch64::SUBXri: 1539 return AArch64::SUBSXri; 1540 case AArch64::SBCWr: 1541 return AArch64::SBCSWr; 1542 case AArch64::SBCXr: 1543 return AArch64::SBCSXr; 1544 case AArch64::ANDWri: 1545 return AArch64::ANDSWri; 1546 case AArch64::ANDXri: 1547 return AArch64::ANDSXri; 1548 } 1549 } 1550 1551 /// Check if AArch64::NZCV should be alive in successors of MBB. 1552 static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) { 1553 for (auto *BB : MBB->successors()) 1554 if (BB->isLiveIn(AArch64::NZCV)) 1555 return true; 1556 return false; 1557 } 1558 1559 /// \returns The condition code operand index for \p Instr if it is a branch 1560 /// or select and -1 otherwise. 
static int
findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
  switch (Instr.getOpcode()) {
  default:
    return -1;

  case AArch64::Bcc: {
    int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
    assert(Idx >= 2);
    return Idx - 2;
  }

  case AArch64::CSINVWr:
  case AArch64::CSINVXr:
  case AArch64::CSINCWr:
  case AArch64::CSINCXr:
  case AArch64::CSELWr:
  case AArch64::CSELXr:
  case AArch64::CSNEGWr:
  case AArch64::CSNEGXr:
  case AArch64::FCSELSrrr:
  case AArch64::FCSELDrrr: {
    int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
    assert(Idx >= 1);
    return Idx - 1;
  }
  }
}

/// Find a condition code used by the instruction.
/// Returns AArch64CC::Invalid if either the instruction does not use condition
/// codes or we don't optimize CmpInstr in the presence of such instructions.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
  int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
  return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
                          Instr.getOperand(CCIdx).getImm())
                    : AArch64CC::Invalid;
}

static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
  assert(CC != AArch64CC::Invalid);
  UsedNZCV UsedFlags;
  switch (CC) {
  default:
    break;

  case AArch64CC::EQ: // Z set
  case AArch64CC::NE: // Z clear
    UsedFlags.Z = true;
    break;

  case AArch64CC::HI: // Z clear and C set
  case AArch64CC::LS: // Z set or C clear
    UsedFlags.Z = true;
    [[fallthrough]];
  case AArch64CC::HS: // C set
  case AArch64CC::LO: // C clear
    UsedFlags.C = true;
    break;

  case AArch64CC::MI: // N set
  case AArch64CC::PL: // N clear
    UsedFlags.N = true;
    break;

  case AArch64CC::VS: // V set
  case AArch64CC::VC: // V clear
    UsedFlags.V = true;
    break;

  case AArch64CC::GT: // Z clear, N and V the same
  case AArch64CC::LE: // Z set, N and V differ
    UsedFlags.Z = true;
    [[fallthrough]];
  case AArch64CC::GE: // N and V the same
  case AArch64CC::LT: // N and V differ
    UsedFlags.N = true;
    UsedFlags.V = true;
    break;
  }
  return UsedFlags;
}

/// \returns Condition flags used after \p CmpInstr in its MachineBB if NZCV
/// flags are not alive in successors of the block shared by \p CmpInstr and
/// \p MI.
/// \returns std::nullopt otherwise.
///
/// Collect instructions using those flags in \p CCUseInstrs if provided.
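///
/// The scan stops at the first instruction after \p CmpInstr that overwrites
/// NZCV.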
1649 std::optional<UsedNZCV> 1650 llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, 1651 const TargetRegisterInfo &TRI, 1652 SmallVectorImpl<MachineInstr *> *CCUseInstrs) { 1653 MachineBasicBlock *CmpParent = CmpInstr.getParent(); 1654 if (MI.getParent() != CmpParent) 1655 return std::nullopt; 1656 1657 if (areCFlagsAliveInSuccessors(CmpParent)) 1658 return std::nullopt; 1659 1660 UsedNZCV NZCVUsedAfterCmp; 1661 for (MachineInstr &Instr : instructionsWithoutDebug( 1662 std::next(CmpInstr.getIterator()), CmpParent->instr_end())) { 1663 if (Instr.readsRegister(AArch64::NZCV, &TRI)) { 1664 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr); 1665 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction 1666 return std::nullopt; 1667 NZCVUsedAfterCmp |= getUsedNZCV(CC); 1668 if (CCUseInstrs) 1669 CCUseInstrs->push_back(&Instr); 1670 } 1671 if (Instr.modifiesRegister(AArch64::NZCV, &TRI)) 1672 break; 1673 } 1674 return NZCVUsedAfterCmp; 1675 } 1676 1677 static bool isADDSRegImm(unsigned Opcode) { 1678 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri; 1679 } 1680 1681 static bool isSUBSRegImm(unsigned Opcode) { 1682 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri; 1683 } 1684 1685 /// Check if CmpInstr can be substituted by MI. 1686 /// 1687 /// CmpInstr can be substituted: 1688 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0' 1689 /// - and, MI and CmpInstr are from the same MachineBB 1690 /// - and, condition flags are not alive in successors of the CmpInstr parent 1691 /// - and, if MI opcode is the S form there must be no defs of flags between 1692 /// MI and CmpInstr 1693 /// or if MI opcode is not the S form there must be neither defs of flags 1694 /// nor uses of flags between MI and CmpInstr. 1695 /// - and C/V flags are not used after CmpInstr 1696 static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, 1697 const TargetRegisterInfo &TRI) { 1698 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END); 1699 1700 const unsigned CmpOpcode = CmpInstr.getOpcode(); 1701 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode)) 1702 return false; 1703 1704 std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI); 1705 if (!NZVCUsed || NZVCUsed->C || NZVCUsed->V) 1706 return false; 1707 1708 AccessKind AccessToCheck = AK_Write; 1709 if (sForm(MI) != MI.getOpcode()) 1710 AccessToCheck = AK_All; 1711 return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck); 1712 } 1713 1714 /// Substitute an instruction comparing to zero with another instruction 1715 /// which produces needed condition flags. 1716 /// 1717 /// Return true on success. 1718 bool AArch64InstrInfo::substituteCmpToZero( 1719 MachineInstr &CmpInstr, unsigned SrcReg, 1720 const MachineRegisterInfo &MRI) const { 1721 // Get the unique definition of SrcReg. 1722 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg); 1723 if (!MI) 1724 return false; 1725 1726 const TargetRegisterInfo &TRI = getRegisterInfo(); 1727 1728 unsigned NewOpc = sForm(*MI); 1729 if (NewOpc == AArch64::INSTRUCTION_LIST_END) 1730 return false; 1731 1732 if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI)) 1733 return false; 1734 1735 // Update the instruction to set NZCV. 
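  // For example, 'add w8, w9, #1' + 'cmp w8, #0' + 'b.eq' becomes
  // 'adds w8, w9, #1' + 'b.eq'.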
1736 MI->setDesc(get(NewOpc)); 1737 CmpInstr.eraseFromParent(); 1738 bool succeeded = UpdateOperandRegClass(*MI); 1739 (void)succeeded; 1740 assert(succeeded && "Some operands reg class are incompatible!"); 1741 MI->addRegisterDefined(AArch64::NZCV, &TRI); 1742 return true; 1743 } 1744 1745 /// \returns True if \p CmpInstr can be removed. 1746 /// 1747 /// \p IsInvertCC is true if, after removing \p CmpInstr, condition 1748 /// codes used in \p CCUseInstrs must be inverted. 1749 static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr, 1750 int CmpValue, const TargetRegisterInfo &TRI, 1751 SmallVectorImpl<MachineInstr *> &CCUseInstrs, 1752 bool &IsInvertCC) { 1753 assert((CmpValue == 0 || CmpValue == 1) && 1754 "Only comparisons to 0 or 1 considered for removal!"); 1755 1756 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>' 1757 unsigned MIOpc = MI.getOpcode(); 1758 if (MIOpc == AArch64::CSINCWr) { 1759 if (MI.getOperand(1).getReg() != AArch64::WZR || 1760 MI.getOperand(2).getReg() != AArch64::WZR) 1761 return false; 1762 } else if (MIOpc == AArch64::CSINCXr) { 1763 if (MI.getOperand(1).getReg() != AArch64::XZR || 1764 MI.getOperand(2).getReg() != AArch64::XZR) 1765 return false; 1766 } else { 1767 return false; 1768 } 1769 AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI); 1770 if (MICC == AArch64CC::Invalid) 1771 return false; 1772 1773 // NZCV needs to be defined 1774 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) 1775 return false; 1776 1777 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1' 1778 const unsigned CmpOpcode = CmpInstr.getOpcode(); 1779 bool IsSubsRegImm = isSUBSRegImm(CmpOpcode); 1780 if (CmpValue && !IsSubsRegImm) 1781 return false; 1782 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode)) 1783 return false; 1784 1785 // MI conditions allowed: eq, ne, mi, pl 1786 UsedNZCV MIUsedNZCV = getUsedNZCV(MICC); 1787 if (MIUsedNZCV.C || MIUsedNZCV.V) 1788 return false; 1789 1790 std::optional<UsedNZCV> NZCVUsedAfterCmp = 1791 examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs); 1792 // Condition flags are not used in CmpInstr basic block successors and only 1793 // Z or N flags allowed to be used after CmpInstr within its basic block 1794 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V) 1795 return false; 1796 // Z or N flag used after CmpInstr must correspond to the flag used in MI 1797 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) || 1798 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z)) 1799 return false; 1800 // If CmpInstr is comparison to zero MI conditions are limited to eq, ne 1801 if (MIUsedNZCV.N && !CmpValue) 1802 return false; 1803 1804 // There must be no defs of flags between MI and CmpInstr 1805 if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write)) 1806 return false; 1807 1808 // Condition code is inverted in the following cases: 1809 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' 1810 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1' 1811 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) || 1812 (!CmpValue && MICC == AArch64CC::NE); 1813 return true; 1814 } 1815 1816 /// Remove comparison in csinc-cmp sequence 1817 /// 1818 /// Examples: 1819 /// 1. \code 1820 /// csinc w9, wzr, wzr, ne 1821 /// cmp w9, #0 1822 /// b.eq 1823 /// \endcode 1824 /// to 1825 /// \code 1826 /// csinc w9, wzr, wzr, ne 1827 /// b.ne 1828 /// \endcode 1829 /// 1830 /// 2. 
\code 1831 /// csinc x2, xzr, xzr, mi 1832 /// cmp x2, #1 1833 /// b.pl 1834 /// \endcode 1835 /// to 1836 /// \code 1837 /// csinc x2, xzr, xzr, mi 1838 /// b.pl 1839 /// \endcode 1840 /// 1841 /// \param CmpInstr comparison instruction 1842 /// \return True when comparison removed 1843 bool AArch64InstrInfo::removeCmpToZeroOrOne( 1844 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue, 1845 const MachineRegisterInfo &MRI) const { 1846 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg); 1847 if (!MI) 1848 return false; 1849 const TargetRegisterInfo &TRI = getRegisterInfo(); 1850 SmallVector<MachineInstr *, 4> CCUseInstrs; 1851 bool IsInvertCC = false; 1852 if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs, 1853 IsInvertCC)) 1854 return false; 1855 // Make transformation 1856 CmpInstr.eraseFromParent(); 1857 if (IsInvertCC) { 1858 // Invert condition codes in CmpInstr CC users 1859 for (MachineInstr *CCUseInstr : CCUseInstrs) { 1860 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr); 1861 assert(Idx >= 0 && "Unexpected instruction using CC."); 1862 MachineOperand &CCOperand = CCUseInstr->getOperand(Idx); 1863 AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode( 1864 static_cast<AArch64CC::CondCode>(CCOperand.getImm())); 1865 CCOperand.setImm(CCUse); 1866 } 1867 } 1868 return true; 1869 } 1870 1871 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { 1872 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD && 1873 MI.getOpcode() != AArch64::CATCHRET) 1874 return false; 1875 1876 MachineBasicBlock &MBB = *MI.getParent(); 1877 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>(); 1878 auto TRI = Subtarget.getRegisterInfo(); 1879 DebugLoc DL = MI.getDebugLoc(); 1880 1881 if (MI.getOpcode() == AArch64::CATCHRET) { 1882 // Skip to the first instruction before the epilog. 
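    // What gets inserted ahead of the SEH epilogue is, roughly (illustrative
    // assembly; the label stands for the funclet's target block):
    //   adrp x0, <target>
    //   add  x0, x0, :lo12:<target>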
    const TargetInstrInfo *TII =
        MBB.getParent()->getSubtarget().getInstrInfo();
    MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
    auto MBBI = MachineBasicBlock::iterator(MI);
    MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
    while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
           FirstEpilogSEH != MBB.begin())
      FirstEpilogSEH = std::prev(FirstEpilogSEH);
    if (FirstEpilogSEH != MBB.begin())
      FirstEpilogSEH = std::next(FirstEpilogSEH);
    BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
        .addReg(AArch64::X0, RegState::Define)
        .addMBB(TargetMBB);
    BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
        .addReg(AArch64::X0, RegState::Define)
        .addReg(AArch64::X0)
        .addMBB(TargetMBB)
        .addImm(0);
    return true;
  }

  Register Reg = MI.getOperand(0).getReg();
  Module &M = *MBB.getParent()->getFunction().getParent();
  if (M.getStackProtectorGuard() == "sysreg") {
    const AArch64SysReg::SysReg *SrcReg =
        AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
    if (!SrcReg)
      report_fatal_error("Unknown SysReg for Stack Protector Guard Register");

    // mrs xN, sysreg
    BuildMI(MBB, MI, DL, get(AArch64::MRS))
        .addDef(Reg, RegState::Renamable)
        .addImm(SrcReg->Encoding);
    int Offset = M.getStackProtectorGuardOffset();
    if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
      // ldr xN, [xN, #offset]
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(Offset / 8);
    } else if (Offset >= -256 && Offset <= 255) {
      // ldur xN, [xN, #offset]
      BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(Offset);
    } else if (Offset >= -4095 && Offset <= 4095) {
      if (Offset > 0) {
        // add xN, xN, #offset
        BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
            .addDef(Reg)
            .addUse(Reg, RegState::Kill)
            .addImm(Offset)
            .addImm(0);
      } else {
        // sub xN, xN, #offset
        BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
            .addDef(Reg)
            .addUse(Reg, RegState::Kill)
            .addImm(-Offset)
            .addImm(0);
      }
      // ldr xN, [xN]
      BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
          .addDef(Reg)
          .addUse(Reg, RegState::Kill)
          .addImm(0);
    } else {
      // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
      // than 32760.
      // It might be nice to use AArch64::MOVi32imm here, which would get
      // expanded in PreSched2 after PostRA, but our lone scratch Reg already
      // contains the MRS result. findScratchNonCalleeSaveRegister() in
      // AArch64FrameLowering might help us find such a scratch register
      // though. If we failed to find a scratch register, we could emit a
      // stream of add instructions to build up the immediate. Or, we could try
      // to insert an AArch64::MOVi32imm before register allocation so that we
      // would not need to scavenge for a scratch register.
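      // For illustration only: with a hypothetical free scratch register xS,
      // the remaining offsets could be handled roughly as
      //   mrs xN, <sysreg>
      //   mov xS, #<offset>
      //   ldr xN, [xN, xS]
      // but nothing guarantees such a scratch register is available here.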
1961 report_fatal_error("Unable to encode Stack Protector Guard Offset"); 1962 } 1963 MBB.erase(MI); 1964 return true; 1965 } 1966 1967 const GlobalValue *GV = 1968 cast<GlobalValue>((*MI.memoperands_begin())->getValue()); 1969 const TargetMachine &TM = MBB.getParent()->getTarget(); 1970 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); 1971 const unsigned char MO_NC = AArch64II::MO_NC; 1972 1973 if ((OpFlags & AArch64II::MO_GOT) != 0) { 1974 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg) 1975 .addGlobalAddress(GV, 0, OpFlags); 1976 if (Subtarget.isTargetILP32()) { 1977 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 1978 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 1979 .addDef(Reg32, RegState::Dead) 1980 .addUse(Reg, RegState::Kill) 1981 .addImm(0) 1982 .addMemOperand(*MI.memoperands_begin()) 1983 .addDef(Reg, RegState::Implicit); 1984 } else { 1985 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 1986 .addReg(Reg, RegState::Kill) 1987 .addImm(0) 1988 .addMemOperand(*MI.memoperands_begin()); 1989 } 1990 } else if (TM.getCodeModel() == CodeModel::Large) { 1991 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?"); 1992 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) 1993 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC) 1994 .addImm(0); 1995 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 1996 .addReg(Reg, RegState::Kill) 1997 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC) 1998 .addImm(16); 1999 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2000 .addReg(Reg, RegState::Kill) 2001 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC) 2002 .addImm(32); 2003 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2004 .addReg(Reg, RegState::Kill) 2005 .addGlobalAddress(GV, 0, AArch64II::MO_G3) 2006 .addImm(48); 2007 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 2008 .addReg(Reg, RegState::Kill) 2009 .addImm(0) 2010 .addMemOperand(*MI.memoperands_begin()); 2011 } else if (TM.getCodeModel() == CodeModel::Tiny) { 2012 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg) 2013 .addGlobalAddress(GV, 0, OpFlags); 2014 } else { 2015 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg) 2016 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); 2017 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC; 2018 if (Subtarget.isTargetILP32()) { 2019 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 2020 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 2021 .addDef(Reg32, RegState::Dead) 2022 .addUse(Reg, RegState::Kill) 2023 .addGlobalAddress(GV, 0, LoFlags) 2024 .addMemOperand(*MI.memoperands_begin()) 2025 .addDef(Reg, RegState::Implicit); 2026 } else { 2027 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 2028 .addReg(Reg, RegState::Kill) 2029 .addGlobalAddress(GV, 0, LoFlags) 2030 .addMemOperand(*MI.memoperands_begin()); 2031 } 2032 } 2033 2034 MBB.erase(MI); 2035 2036 return true; 2037 } 2038 2039 // Return true if this instruction simply sets its single destination register 2040 // to zero. This is equivalent to a register rename of the zero-register. 
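// For illustration (not an exhaustive list), each of these is recognized:
//   movz w0, #0          // MOVZWi with a zero immediate and zero shift
//   and  x1, xzr, #0xff  // ANDXri whose source is the zero register
// as is a plain COPY whose source is WZR.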
bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
    if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
      assert(MI.getDesc().getNumOperands() == 3 &&
             MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
      return true;
    }
    break;
  case AArch64::ANDWri: // and Rd, Rzr, #imm
    return MI.getOperand(1).getReg() == AArch64::WZR;
  case AArch64::ANDXri:
    return MI.getOperand(1).getReg() == AArch64::XZR;
  case TargetOpcode::COPY:
    return MI.getOperand(1).getReg() == AArch64::WZR;
  }
  return false;
}

// Return true if this instruction simply renames a general register without
// modifying bits.
bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    break;
  case TargetOpcode::COPY: {
    // GPR32 copies will be lowered to ORRXrs
    Register DstReg = MI.getOperand(0).getReg();
    return (AArch64::GPR32RegClass.contains(DstReg) ||
            AArch64::GPR64RegClass.contains(DstReg));
  }
  case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
    if (MI.getOperand(1).getReg() == AArch64::XZR) {
      assert(MI.getDesc().getNumOperands() == 4 &&
             MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
      return true;
    }
    break;
  case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
    if (MI.getOperand(2).getImm() == 0) {
      assert(MI.getDesc().getNumOperands() == 4 &&
             MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
      return true;
    }
    break;
  }
  return false;
}

// Return true if this instruction simply renames a floating-point register
// without modifying bits.
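// For illustration, 'orr v0.16b, v1.16b, v1.16b' (an ORRv16i8 with identical
// source registers, i.e. the canonical vector move) qualifies, as does a COPY
// whose destination is an FPR128 register.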
2095 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) { 2096 switch (MI.getOpcode()) { 2097 default: 2098 break; 2099 case TargetOpcode::COPY: { 2100 Register DstReg = MI.getOperand(0).getReg(); 2101 return AArch64::FPR128RegClass.contains(DstReg); 2102 } 2103 case AArch64::ORRv16i8: 2104 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) { 2105 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() && 2106 "invalid ORRv16i8 operands"); 2107 return true; 2108 } 2109 break; 2110 } 2111 return false; 2112 } 2113 2114 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, 2115 int &FrameIndex) const { 2116 switch (MI.getOpcode()) { 2117 default: 2118 break; 2119 case AArch64::LDRWui: 2120 case AArch64::LDRXui: 2121 case AArch64::LDRBui: 2122 case AArch64::LDRHui: 2123 case AArch64::LDRSui: 2124 case AArch64::LDRDui: 2125 case AArch64::LDRQui: 2126 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && 2127 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { 2128 FrameIndex = MI.getOperand(1).getIndex(); 2129 return MI.getOperand(0).getReg(); 2130 } 2131 break; 2132 } 2133 2134 return 0; 2135 } 2136 2137 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI, 2138 int &FrameIndex) const { 2139 switch (MI.getOpcode()) { 2140 default: 2141 break; 2142 case AArch64::STRWui: 2143 case AArch64::STRXui: 2144 case AArch64::STRBui: 2145 case AArch64::STRHui: 2146 case AArch64::STRSui: 2147 case AArch64::STRDui: 2148 case AArch64::STRQui: 2149 case AArch64::LDR_PXI: 2150 case AArch64::STR_PXI: 2151 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && 2152 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { 2153 FrameIndex = MI.getOperand(1).getIndex(); 2154 return MI.getOperand(0).getReg(); 2155 } 2156 break; 2157 } 2158 return 0; 2159 } 2160 2161 /// Check all MachineMemOperands for a hint to suppress pairing. 2162 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) { 2163 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 2164 return MMO->getFlags() & MOSuppressPair; 2165 }); 2166 } 2167 2168 /// Set a flag on the first MachineMemOperand to suppress pairing. 2169 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) { 2170 if (MI.memoperands_empty()) 2171 return; 2172 (*MI.memoperands_begin())->setFlags(MOSuppressPair); 2173 } 2174 2175 /// Check all MachineMemOperands for a hint that the load/store is strided. 
2176 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) { 2177 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 2178 return MMO->getFlags() & MOStridedAccess; 2179 }); 2180 } 2181 2182 bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) { 2183 switch (Opc) { 2184 default: 2185 return false; 2186 case AArch64::STURSi: 2187 case AArch64::STRSpre: 2188 case AArch64::STURDi: 2189 case AArch64::STRDpre: 2190 case AArch64::STURQi: 2191 case AArch64::STRQpre: 2192 case AArch64::STURBBi: 2193 case AArch64::STURHHi: 2194 case AArch64::STURWi: 2195 case AArch64::STRWpre: 2196 case AArch64::STURXi: 2197 case AArch64::STRXpre: 2198 case AArch64::LDURSi: 2199 case AArch64::LDRSpre: 2200 case AArch64::LDURDi: 2201 case AArch64::LDRDpre: 2202 case AArch64::LDURQi: 2203 case AArch64::LDRQpre: 2204 case AArch64::LDURWi: 2205 case AArch64::LDRWpre: 2206 case AArch64::LDURXi: 2207 case AArch64::LDRXpre: 2208 case AArch64::LDURSWi: 2209 case AArch64::LDURHHi: 2210 case AArch64::LDURBBi: 2211 case AArch64::LDURSBWi: 2212 case AArch64::LDURSHWi: 2213 return true; 2214 } 2215 } 2216 2217 std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) { 2218 switch (Opc) { 2219 default: return {}; 2220 case AArch64::PRFMui: return AArch64::PRFUMi; 2221 case AArch64::LDRXui: return AArch64::LDURXi; 2222 case AArch64::LDRWui: return AArch64::LDURWi; 2223 case AArch64::LDRBui: return AArch64::LDURBi; 2224 case AArch64::LDRHui: return AArch64::LDURHi; 2225 case AArch64::LDRSui: return AArch64::LDURSi; 2226 case AArch64::LDRDui: return AArch64::LDURDi; 2227 case AArch64::LDRQui: return AArch64::LDURQi; 2228 case AArch64::LDRBBui: return AArch64::LDURBBi; 2229 case AArch64::LDRHHui: return AArch64::LDURHHi; 2230 case AArch64::LDRSBXui: return AArch64::LDURSBXi; 2231 case AArch64::LDRSBWui: return AArch64::LDURSBWi; 2232 case AArch64::LDRSHXui: return AArch64::LDURSHXi; 2233 case AArch64::LDRSHWui: return AArch64::LDURSHWi; 2234 case AArch64::LDRSWui: return AArch64::LDURSWi; 2235 case AArch64::STRXui: return AArch64::STURXi; 2236 case AArch64::STRWui: return AArch64::STURWi; 2237 case AArch64::STRBui: return AArch64::STURBi; 2238 case AArch64::STRHui: return AArch64::STURHi; 2239 case AArch64::STRSui: return AArch64::STURSi; 2240 case AArch64::STRDui: return AArch64::STURDi; 2241 case AArch64::STRQui: return AArch64::STURQi; 2242 case AArch64::STRBBui: return AArch64::STURBBi; 2243 case AArch64::STRHHui: return AArch64::STURHHi; 2244 } 2245 } 2246 2247 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { 2248 switch (Opc) { 2249 default: 2250 return 2; 2251 case AArch64::LDPXi: 2252 case AArch64::LDPDi: 2253 case AArch64::STPXi: 2254 case AArch64::STPDi: 2255 case AArch64::LDNPXi: 2256 case AArch64::LDNPDi: 2257 case AArch64::STNPXi: 2258 case AArch64::STNPDi: 2259 case AArch64::LDPQi: 2260 case AArch64::STPQi: 2261 case AArch64::LDNPQi: 2262 case AArch64::STNPQi: 2263 case AArch64::LDPWi: 2264 case AArch64::LDPSi: 2265 case AArch64::STPWi: 2266 case AArch64::STPSi: 2267 case AArch64::LDNPWi: 2268 case AArch64::LDNPSi: 2269 case AArch64::STNPWi: 2270 case AArch64::STNPSi: 2271 case AArch64::LDG: 2272 case AArch64::STGPi: 2273 2274 case AArch64::LD1B_IMM: 2275 case AArch64::LD1B_H_IMM: 2276 case AArch64::LD1B_S_IMM: 2277 case AArch64::LD1B_D_IMM: 2278 case AArch64::LD1SB_H_IMM: 2279 case AArch64::LD1SB_S_IMM: 2280 case AArch64::LD1SB_D_IMM: 2281 case AArch64::LD1H_IMM: 2282 case AArch64::LD1H_S_IMM: 2283 case AArch64::LD1H_D_IMM: 2284 case AArch64::LD1SH_S_IMM: 2285 
case AArch64::LD1SH_D_IMM: 2286 case AArch64::LD1W_IMM: 2287 case AArch64::LD1W_D_IMM: 2288 case AArch64::LD1SW_D_IMM: 2289 case AArch64::LD1D_IMM: 2290 2291 case AArch64::LD2B_IMM: 2292 case AArch64::LD2H_IMM: 2293 case AArch64::LD2W_IMM: 2294 case AArch64::LD2D_IMM: 2295 case AArch64::LD3B_IMM: 2296 case AArch64::LD3H_IMM: 2297 case AArch64::LD3W_IMM: 2298 case AArch64::LD3D_IMM: 2299 case AArch64::LD4B_IMM: 2300 case AArch64::LD4H_IMM: 2301 case AArch64::LD4W_IMM: 2302 case AArch64::LD4D_IMM: 2303 2304 case AArch64::ST1B_IMM: 2305 case AArch64::ST1B_H_IMM: 2306 case AArch64::ST1B_S_IMM: 2307 case AArch64::ST1B_D_IMM: 2308 case AArch64::ST1H_IMM: 2309 case AArch64::ST1H_S_IMM: 2310 case AArch64::ST1H_D_IMM: 2311 case AArch64::ST1W_IMM: 2312 case AArch64::ST1W_D_IMM: 2313 case AArch64::ST1D_IMM: 2314 2315 case AArch64::ST2B_IMM: 2316 case AArch64::ST2H_IMM: 2317 case AArch64::ST2W_IMM: 2318 case AArch64::ST2D_IMM: 2319 case AArch64::ST3B_IMM: 2320 case AArch64::ST3H_IMM: 2321 case AArch64::ST3W_IMM: 2322 case AArch64::ST3D_IMM: 2323 case AArch64::ST4B_IMM: 2324 case AArch64::ST4H_IMM: 2325 case AArch64::ST4W_IMM: 2326 case AArch64::ST4D_IMM: 2327 2328 case AArch64::LD1RB_IMM: 2329 case AArch64::LD1RB_H_IMM: 2330 case AArch64::LD1RB_S_IMM: 2331 case AArch64::LD1RB_D_IMM: 2332 case AArch64::LD1RSB_H_IMM: 2333 case AArch64::LD1RSB_S_IMM: 2334 case AArch64::LD1RSB_D_IMM: 2335 case AArch64::LD1RH_IMM: 2336 case AArch64::LD1RH_S_IMM: 2337 case AArch64::LD1RH_D_IMM: 2338 case AArch64::LD1RSH_S_IMM: 2339 case AArch64::LD1RSH_D_IMM: 2340 case AArch64::LD1RW_IMM: 2341 case AArch64::LD1RW_D_IMM: 2342 case AArch64::LD1RSW_IMM: 2343 case AArch64::LD1RD_IMM: 2344 2345 case AArch64::LDNT1B_ZRI: 2346 case AArch64::LDNT1H_ZRI: 2347 case AArch64::LDNT1W_ZRI: 2348 case AArch64::LDNT1D_ZRI: 2349 case AArch64::STNT1B_ZRI: 2350 case AArch64::STNT1H_ZRI: 2351 case AArch64::STNT1W_ZRI: 2352 case AArch64::STNT1D_ZRI: 2353 2354 case AArch64::LDNF1B_IMM: 2355 case AArch64::LDNF1B_H_IMM: 2356 case AArch64::LDNF1B_S_IMM: 2357 case AArch64::LDNF1B_D_IMM: 2358 case AArch64::LDNF1SB_H_IMM: 2359 case AArch64::LDNF1SB_S_IMM: 2360 case AArch64::LDNF1SB_D_IMM: 2361 case AArch64::LDNF1H_IMM: 2362 case AArch64::LDNF1H_S_IMM: 2363 case AArch64::LDNF1H_D_IMM: 2364 case AArch64::LDNF1SH_S_IMM: 2365 case AArch64::LDNF1SH_D_IMM: 2366 case AArch64::LDNF1W_IMM: 2367 case AArch64::LDNF1W_D_IMM: 2368 case AArch64::LDNF1SW_D_IMM: 2369 case AArch64::LDNF1D_IMM: 2370 return 3; 2371 case AArch64::ADDG: 2372 case AArch64::STGOffset: 2373 case AArch64::LDR_PXI: 2374 case AArch64::STR_PXI: 2375 return 2; 2376 } 2377 } 2378 2379 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) { 2380 switch (MI.getOpcode()) { 2381 default: 2382 return false; 2383 // Scaled instructions. 2384 case AArch64::STRSui: 2385 case AArch64::STRDui: 2386 case AArch64::STRQui: 2387 case AArch64::STRXui: 2388 case AArch64::STRWui: 2389 case AArch64::LDRSui: 2390 case AArch64::LDRDui: 2391 case AArch64::LDRQui: 2392 case AArch64::LDRXui: 2393 case AArch64::LDRWui: 2394 case AArch64::LDRSWui: 2395 // Unscaled instructions. 
2396 case AArch64::STURSi: 2397 case AArch64::STRSpre: 2398 case AArch64::STURDi: 2399 case AArch64::STRDpre: 2400 case AArch64::STURQi: 2401 case AArch64::STRQpre: 2402 case AArch64::STURWi: 2403 case AArch64::STRWpre: 2404 case AArch64::STURXi: 2405 case AArch64::STRXpre: 2406 case AArch64::LDURSi: 2407 case AArch64::LDRSpre: 2408 case AArch64::LDURDi: 2409 case AArch64::LDRDpre: 2410 case AArch64::LDURQi: 2411 case AArch64::LDRQpre: 2412 case AArch64::LDURWi: 2413 case AArch64::LDRWpre: 2414 case AArch64::LDURXi: 2415 case AArch64::LDRXpre: 2416 case AArch64::LDURSWi: 2417 return true; 2418 } 2419 } 2420 2421 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) { 2422 switch (Opc) { 2423 default: 2424 llvm_unreachable("Opcode has no flag setting equivalent!"); 2425 // 32-bit cases: 2426 case AArch64::ADDWri: 2427 return AArch64::ADDSWri; 2428 case AArch64::ADDWrr: 2429 return AArch64::ADDSWrr; 2430 case AArch64::ADDWrs: 2431 return AArch64::ADDSWrs; 2432 case AArch64::ADDWrx: 2433 return AArch64::ADDSWrx; 2434 case AArch64::ANDWri: 2435 return AArch64::ANDSWri; 2436 case AArch64::ANDWrr: 2437 return AArch64::ANDSWrr; 2438 case AArch64::ANDWrs: 2439 return AArch64::ANDSWrs; 2440 case AArch64::BICWrr: 2441 return AArch64::BICSWrr; 2442 case AArch64::BICWrs: 2443 return AArch64::BICSWrs; 2444 case AArch64::SUBWri: 2445 return AArch64::SUBSWri; 2446 case AArch64::SUBWrr: 2447 return AArch64::SUBSWrr; 2448 case AArch64::SUBWrs: 2449 return AArch64::SUBSWrs; 2450 case AArch64::SUBWrx: 2451 return AArch64::SUBSWrx; 2452 // 64-bit cases: 2453 case AArch64::ADDXri: 2454 return AArch64::ADDSXri; 2455 case AArch64::ADDXrr: 2456 return AArch64::ADDSXrr; 2457 case AArch64::ADDXrs: 2458 return AArch64::ADDSXrs; 2459 case AArch64::ADDXrx: 2460 return AArch64::ADDSXrx; 2461 case AArch64::ANDXri: 2462 return AArch64::ANDSXri; 2463 case AArch64::ANDXrr: 2464 return AArch64::ANDSXrr; 2465 case AArch64::ANDXrs: 2466 return AArch64::ANDSXrs; 2467 case AArch64::BICXrr: 2468 return AArch64::BICSXrr; 2469 case AArch64::BICXrs: 2470 return AArch64::BICSXrs; 2471 case AArch64::SUBXri: 2472 return AArch64::SUBSXri; 2473 case AArch64::SUBXrr: 2474 return AArch64::SUBSXrr; 2475 case AArch64::SUBXrs: 2476 return AArch64::SUBSXrs; 2477 case AArch64::SUBXrx: 2478 return AArch64::SUBSXrx; 2479 // SVE instructions: 2480 case AArch64::AND_PPzPP: 2481 return AArch64::ANDS_PPzPP; 2482 case AArch64::BIC_PPzPP: 2483 return AArch64::BICS_PPzPP; 2484 case AArch64::EOR_PPzPP: 2485 return AArch64::EORS_PPzPP; 2486 case AArch64::NAND_PPzPP: 2487 return AArch64::NANDS_PPzPP; 2488 case AArch64::NOR_PPzPP: 2489 return AArch64::NORS_PPzPP; 2490 case AArch64::ORN_PPzPP: 2491 return AArch64::ORNS_PPzPP; 2492 case AArch64::ORR_PPzPP: 2493 return AArch64::ORRS_PPzPP; 2494 case AArch64::BRKA_PPzP: 2495 return AArch64::BRKAS_PPzP; 2496 case AArch64::BRKPA_PPzPP: 2497 return AArch64::BRKPAS_PPzPP; 2498 case AArch64::BRKB_PPzP: 2499 return AArch64::BRKBS_PPzP; 2500 case AArch64::BRKPB_PPzPP: 2501 return AArch64::BRKPBS_PPzPP; 2502 case AArch64::BRKN_PPzP: 2503 return AArch64::BRKNS_PPzP; 2504 case AArch64::RDFFR_PPz: 2505 return AArch64::RDFFRS_PPz; 2506 case AArch64::PTRUE_B: 2507 return AArch64::PTRUES_B; 2508 } 2509 } 2510 2511 // Is this a candidate for ld/st merging or pairing? For example, we don't 2512 // touch volatiles or load/stores that have a hint to avoid pair formation. 
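// A typical positive case, for illustration, is a pair of adjacent scaled
// accesses off the same unmodified base that the load/store optimizer can
// turn into a single paired access:
//   ldr x0, [x2]
//   ldr x1, [x2, #8]    =>    ldp x0, x1, [x2]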
2513 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const { 2514 2515 bool IsPreLdSt = isPreLdSt(MI); 2516 2517 // If this is a volatile load/store, don't mess with it. 2518 if (MI.hasOrderedMemoryRef()) 2519 return false; 2520 2521 // Make sure this is a reg/fi+imm (as opposed to an address reloc). 2522 // For Pre-inc LD/ST, the operand is shifted by one. 2523 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() || 2524 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) && 2525 "Expected a reg or frame index operand."); 2526 2527 // For Pre-indexed addressing quadword instructions, the third operand is the 2528 // immediate value. 2529 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm(); 2530 2531 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt) 2532 return false; 2533 2534 // Can't merge/pair if the instruction modifies the base register. 2535 // e.g., ldr x0, [x0] 2536 // This case will never occur with an FI base. 2537 // However, if the instruction is an LDR/STR<S,D,Q,W,X>pre, it can be merged. 2538 // For example: 2539 // ldr q0, [x11, #32]! 2540 // ldr q1, [x11, #16] 2541 // to 2542 // ldp q0, q1, [x11, #32]! 2543 if (MI.getOperand(1).isReg() && !IsPreLdSt) { 2544 Register BaseReg = MI.getOperand(1).getReg(); 2545 const TargetRegisterInfo *TRI = &getRegisterInfo(); 2546 if (MI.modifiesRegister(BaseReg, TRI)) 2547 return false; 2548 } 2549 2550 // Check if this load/store has a hint to avoid pair formation. 2551 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. 2552 if (isLdStPairSuppressed(MI)) 2553 return false; 2554 2555 // Do not pair any callee-save store/reload instructions in the 2556 // prologue/epilogue if the CFI information encoded the operations as separate 2557 // instructions, as that will cause the size of the actual prologue to mismatch 2558 // with the prologue size recorded in the Windows CFI. 2559 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo(); 2560 bool NeedsWinCFI = MAI->usesWindowsCFI() && 2561 MI.getMF()->getFunction().needsUnwindTableEntry(); 2562 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) || 2563 MI.getFlag(MachineInstr::FrameDestroy))) 2564 return false; 2565 2566 // On some CPUs quad load/store pairs are slower than two single load/stores. 2567 if (Subtarget.isPaired128Slow()) { 2568 switch (MI.getOpcode()) { 2569 default: 2570 break; 2571 case AArch64::LDURQi: 2572 case AArch64::STURQi: 2573 case AArch64::LDRQui: 2574 case AArch64::STRQui: 2575 return false; 2576 } 2577 } 2578 2579 return true; 2580 } 2581 2582 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth( 2583 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, 2584 int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, 2585 const TargetRegisterInfo *TRI) const { 2586 if (!LdSt.mayLoadOrStore()) 2587 return false; 2588 2589 const MachineOperand *BaseOp; 2590 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable, 2591 Width, TRI)) 2592 return false; 2593 BaseOps.push_back(BaseOp); 2594 return true; 2595 } 2596 2597 std::optional<ExtAddrMode> 2598 AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI, 2599 const TargetRegisterInfo *TRI) const { 2600 const MachineOperand *Base; // Filled with the base operand of MI. 2601 int64_t Offset; // Filled with the offset of MI. 
2602 bool OffsetIsScalable; 2603 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI)) 2604 return std::nullopt; 2605 2606 if (!Base->isReg()) 2607 return std::nullopt; 2608 ExtAddrMode AM; 2609 AM.BaseReg = Base->getReg(); 2610 AM.Displacement = Offset; 2611 AM.ScaledReg = 0; 2612 AM.Scale = 0; 2613 return AM; 2614 } 2615 2616 bool AArch64InstrInfo::getMemOperandWithOffsetWidth( 2617 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset, 2618 bool &OffsetIsScalable, unsigned &Width, 2619 const TargetRegisterInfo *TRI) const { 2620 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 2621 // Handle only loads/stores with base register followed by immediate offset. 2622 if (LdSt.getNumExplicitOperands() == 3) { 2623 // Non-paired instruction (e.g., ldr x1, [x0, #8]). 2624 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) || 2625 !LdSt.getOperand(2).isImm()) 2626 return false; 2627 } else if (LdSt.getNumExplicitOperands() == 4) { 2628 // Paired instruction (e.g., ldp x1, x2, [x0, #8]). 2629 if (!LdSt.getOperand(1).isReg() || 2630 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) || 2631 !LdSt.getOperand(3).isImm()) 2632 return false; 2633 } else 2634 return false; 2635 2636 // Get the scaling factor for the instruction and set the width for the 2637 // instruction. 2638 TypeSize Scale(0U, false); 2639 int64_t Dummy1, Dummy2; 2640 2641 // If this returns false, then it's an instruction we don't want to handle. 2642 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2)) 2643 return false; 2644 2645 // Compute the offset. Offset is calculated as the immediate operand 2646 // multiplied by the scaling factor. Unscaled instructions have scaling factor 2647 // set to 1. 2648 if (LdSt.getNumExplicitOperands() == 3) { 2649 BaseOp = &LdSt.getOperand(1); 2650 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinValue(); 2651 } else { 2652 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); 2653 BaseOp = &LdSt.getOperand(2); 2654 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinValue(); 2655 } 2656 OffsetIsScalable = Scale.isScalable(); 2657 2658 if (!BaseOp->isReg() && !BaseOp->isFI()) 2659 return false; 2660 2661 return true; 2662 } 2663 2664 MachineOperand & 2665 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const { 2666 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 2667 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1); 2668 assert(OfsOp.isImm() && "Offset operand wasn't immediate."); 2669 return OfsOp; 2670 } 2671 2672 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, 2673 unsigned &Width, int64_t &MinOffset, 2674 int64_t &MaxOffset) { 2675 const unsigned SVEMaxBytesPerVector = AArch64::SVEMaxBitsPerVector / 8; 2676 switch (Opcode) { 2677 // Not a memory operation or something we want to handle. 
2678 default: 2679 Scale = TypeSize::Fixed(0); 2680 Width = 0; 2681 MinOffset = MaxOffset = 0; 2682 return false; 2683 case AArch64::STRWpost: 2684 case AArch64::LDRWpost: 2685 Width = 32; 2686 Scale = TypeSize::Fixed(4); 2687 MinOffset = -256; 2688 MaxOffset = 255; 2689 break; 2690 case AArch64::LDURQi: 2691 case AArch64::STURQi: 2692 Width = 16; 2693 Scale = TypeSize::Fixed(1); 2694 MinOffset = -256; 2695 MaxOffset = 255; 2696 break; 2697 case AArch64::PRFUMi: 2698 case AArch64::LDURXi: 2699 case AArch64::LDURDi: 2700 case AArch64::STURXi: 2701 case AArch64::STURDi: 2702 Width = 8; 2703 Scale = TypeSize::Fixed(1); 2704 MinOffset = -256; 2705 MaxOffset = 255; 2706 break; 2707 case AArch64::LDURWi: 2708 case AArch64::LDURSi: 2709 case AArch64::LDURSWi: 2710 case AArch64::STURWi: 2711 case AArch64::STURSi: 2712 Width = 4; 2713 Scale = TypeSize::Fixed(1); 2714 MinOffset = -256; 2715 MaxOffset = 255; 2716 break; 2717 case AArch64::LDURHi: 2718 case AArch64::LDURHHi: 2719 case AArch64::LDURSHXi: 2720 case AArch64::LDURSHWi: 2721 case AArch64::STURHi: 2722 case AArch64::STURHHi: 2723 Width = 2; 2724 Scale = TypeSize::Fixed(1); 2725 MinOffset = -256; 2726 MaxOffset = 255; 2727 break; 2728 case AArch64::LDURBi: 2729 case AArch64::LDURBBi: 2730 case AArch64::LDURSBXi: 2731 case AArch64::LDURSBWi: 2732 case AArch64::STURBi: 2733 case AArch64::STURBBi: 2734 Width = 1; 2735 Scale = TypeSize::Fixed(1); 2736 MinOffset = -256; 2737 MaxOffset = 255; 2738 break; 2739 case AArch64::LDPQi: 2740 case AArch64::LDNPQi: 2741 case AArch64::STPQi: 2742 case AArch64::STNPQi: 2743 Scale = TypeSize::Fixed(16); 2744 Width = 32; 2745 MinOffset = -64; 2746 MaxOffset = 63; 2747 break; 2748 case AArch64::LDRQui: 2749 case AArch64::STRQui: 2750 Scale = TypeSize::Fixed(16); 2751 Width = 16; 2752 MinOffset = 0; 2753 MaxOffset = 4095; 2754 break; 2755 case AArch64::LDPXi: 2756 case AArch64::LDPDi: 2757 case AArch64::LDNPXi: 2758 case AArch64::LDNPDi: 2759 case AArch64::STPXi: 2760 case AArch64::STPDi: 2761 case AArch64::STNPXi: 2762 case AArch64::STNPDi: 2763 Scale = TypeSize::Fixed(8); 2764 Width = 16; 2765 MinOffset = -64; 2766 MaxOffset = 63; 2767 break; 2768 case AArch64::PRFMui: 2769 case AArch64::LDRXui: 2770 case AArch64::LDRDui: 2771 case AArch64::STRXui: 2772 case AArch64::STRDui: 2773 Scale = TypeSize::Fixed(8); 2774 Width = 8; 2775 MinOffset = 0; 2776 MaxOffset = 4095; 2777 break; 2778 case AArch64::StoreSwiftAsyncContext: 2779 // Store is an STRXui, but there might be an ADDXri in the expansion too. 
2780 Scale = TypeSize::Fixed(1); 2781 Width = 8; 2782 MinOffset = 0; 2783 MaxOffset = 4095; 2784 break; 2785 case AArch64::LDPWi: 2786 case AArch64::LDPSi: 2787 case AArch64::LDNPWi: 2788 case AArch64::LDNPSi: 2789 case AArch64::STPWi: 2790 case AArch64::STPSi: 2791 case AArch64::STNPWi: 2792 case AArch64::STNPSi: 2793 Scale = TypeSize::Fixed(4); 2794 Width = 8; 2795 MinOffset = -64; 2796 MaxOffset = 63; 2797 break; 2798 case AArch64::LDRWui: 2799 case AArch64::LDRSui: 2800 case AArch64::LDRSWui: 2801 case AArch64::STRWui: 2802 case AArch64::STRSui: 2803 Scale = TypeSize::Fixed(4); 2804 Width = 4; 2805 MinOffset = 0; 2806 MaxOffset = 4095; 2807 break; 2808 case AArch64::LDRHui: 2809 case AArch64::LDRHHui: 2810 case AArch64::LDRSHWui: 2811 case AArch64::LDRSHXui: 2812 case AArch64::STRHui: 2813 case AArch64::STRHHui: 2814 Scale = TypeSize::Fixed(2); 2815 Width = 2; 2816 MinOffset = 0; 2817 MaxOffset = 4095; 2818 break; 2819 case AArch64::LDRBui: 2820 case AArch64::LDRBBui: 2821 case AArch64::LDRSBWui: 2822 case AArch64::LDRSBXui: 2823 case AArch64::STRBui: 2824 case AArch64::STRBBui: 2825 Scale = TypeSize::Fixed(1); 2826 Width = 1; 2827 MinOffset = 0; 2828 MaxOffset = 4095; 2829 break; 2830 case AArch64::STPXpre: 2831 case AArch64::LDPXpost: 2832 case AArch64::STPDpre: 2833 case AArch64::LDPDpost: 2834 Scale = TypeSize::Fixed(8); 2835 Width = 8; 2836 MinOffset = -512; 2837 MaxOffset = 504; 2838 break; 2839 case AArch64::STPQpre: 2840 case AArch64::LDPQpost: 2841 Scale = TypeSize::Fixed(16); 2842 Width = 16; 2843 MinOffset = -1024; 2844 MaxOffset = 1008; 2845 break; 2846 case AArch64::STRXpre: 2847 case AArch64::STRDpre: 2848 case AArch64::LDRXpost: 2849 case AArch64::LDRDpost: 2850 Scale = TypeSize::Fixed(1); 2851 Width = 8; 2852 MinOffset = -256; 2853 MaxOffset = 255; 2854 break; 2855 case AArch64::STRQpre: 2856 case AArch64::LDRQpost: 2857 Scale = TypeSize::Fixed(1); 2858 Width = 16; 2859 MinOffset = -256; 2860 MaxOffset = 255; 2861 break; 2862 case AArch64::ADDG: 2863 Scale = TypeSize::Fixed(16); 2864 Width = 0; 2865 MinOffset = 0; 2866 MaxOffset = 63; 2867 break; 2868 case AArch64::TAGPstack: 2869 Scale = TypeSize::Fixed(16); 2870 Width = 0; 2871 // TAGP with a negative offset turns into SUBP, which has a maximum offset 2872 // of 63 (not 64!). 
2873 MinOffset = -63; 2874 MaxOffset = 63; 2875 break; 2876 case AArch64::LDG: 2877 case AArch64::STGOffset: 2878 case AArch64::STZGOffset: 2879 Scale = TypeSize::Fixed(16); 2880 Width = 16; 2881 MinOffset = -256; 2882 MaxOffset = 255; 2883 break; 2884 case AArch64::STR_ZZZZXI: 2885 case AArch64::LDR_ZZZZXI: 2886 Scale = TypeSize::Scalable(16); 2887 Width = SVEMaxBytesPerVector * 4; 2888 MinOffset = -256; 2889 MaxOffset = 252; 2890 break; 2891 case AArch64::STR_ZZZXI: 2892 case AArch64::LDR_ZZZXI: 2893 Scale = TypeSize::Scalable(16); 2894 Width = SVEMaxBytesPerVector * 3; 2895 MinOffset = -256; 2896 MaxOffset = 253; 2897 break; 2898 case AArch64::STR_ZZXI: 2899 case AArch64::LDR_ZZXI: 2900 Scale = TypeSize::Scalable(16); 2901 Width = SVEMaxBytesPerVector * 2; 2902 MinOffset = -256; 2903 MaxOffset = 254; 2904 break; 2905 case AArch64::LDR_PXI: 2906 case AArch64::STR_PXI: 2907 Scale = TypeSize::Scalable(2); 2908 Width = SVEMaxBytesPerVector / 8; 2909 MinOffset = -256; 2910 MaxOffset = 255; 2911 break; 2912 case AArch64::LDR_ZXI: 2913 case AArch64::STR_ZXI: 2914 Scale = TypeSize::Scalable(16); 2915 Width = SVEMaxBytesPerVector; 2916 MinOffset = -256; 2917 MaxOffset = 255; 2918 break; 2919 case AArch64::LD1B_IMM: 2920 case AArch64::LD1H_IMM: 2921 case AArch64::LD1W_IMM: 2922 case AArch64::LD1D_IMM: 2923 case AArch64::LDNT1B_ZRI: 2924 case AArch64::LDNT1H_ZRI: 2925 case AArch64::LDNT1W_ZRI: 2926 case AArch64::LDNT1D_ZRI: 2927 case AArch64::ST1B_IMM: 2928 case AArch64::ST1H_IMM: 2929 case AArch64::ST1W_IMM: 2930 case AArch64::ST1D_IMM: 2931 case AArch64::STNT1B_ZRI: 2932 case AArch64::STNT1H_ZRI: 2933 case AArch64::STNT1W_ZRI: 2934 case AArch64::STNT1D_ZRI: 2935 case AArch64::LDNF1B_IMM: 2936 case AArch64::LDNF1H_IMM: 2937 case AArch64::LDNF1W_IMM: 2938 case AArch64::LDNF1D_IMM: 2939 // A full vectors worth of data 2940 // Width = mbytes * elements 2941 Scale = TypeSize::Scalable(16); 2942 Width = SVEMaxBytesPerVector; 2943 MinOffset = -8; 2944 MaxOffset = 7; 2945 break; 2946 case AArch64::LD2B_IMM: 2947 case AArch64::LD2H_IMM: 2948 case AArch64::LD2W_IMM: 2949 case AArch64::LD2D_IMM: 2950 case AArch64::ST2B_IMM: 2951 case AArch64::ST2H_IMM: 2952 case AArch64::ST2W_IMM: 2953 case AArch64::ST2D_IMM: 2954 Scale = TypeSize::Scalable(32); 2955 Width = SVEMaxBytesPerVector * 2; 2956 MinOffset = -8; 2957 MaxOffset = 7; 2958 break; 2959 case AArch64::LD3B_IMM: 2960 case AArch64::LD3H_IMM: 2961 case AArch64::LD3W_IMM: 2962 case AArch64::LD3D_IMM: 2963 case AArch64::ST3B_IMM: 2964 case AArch64::ST3H_IMM: 2965 case AArch64::ST3W_IMM: 2966 case AArch64::ST3D_IMM: 2967 Scale = TypeSize::Scalable(48); 2968 Width = SVEMaxBytesPerVector * 3; 2969 MinOffset = -8; 2970 MaxOffset = 7; 2971 break; 2972 case AArch64::LD4B_IMM: 2973 case AArch64::LD4H_IMM: 2974 case AArch64::LD4W_IMM: 2975 case AArch64::LD4D_IMM: 2976 case AArch64::ST4B_IMM: 2977 case AArch64::ST4H_IMM: 2978 case AArch64::ST4W_IMM: 2979 case AArch64::ST4D_IMM: 2980 Scale = TypeSize::Scalable(64); 2981 Width = SVEMaxBytesPerVector * 4; 2982 MinOffset = -8; 2983 MaxOffset = 7; 2984 break; 2985 case AArch64::LD1B_H_IMM: 2986 case AArch64::LD1SB_H_IMM: 2987 case AArch64::LD1H_S_IMM: 2988 case AArch64::LD1SH_S_IMM: 2989 case AArch64::LD1W_D_IMM: 2990 case AArch64::LD1SW_D_IMM: 2991 case AArch64::ST1B_H_IMM: 2992 case AArch64::ST1H_S_IMM: 2993 case AArch64::ST1W_D_IMM: 2994 case AArch64::LDNF1B_H_IMM: 2995 case AArch64::LDNF1SB_H_IMM: 2996 case AArch64::LDNF1H_S_IMM: 2997 case AArch64::LDNF1SH_S_IMM: 2998 case AArch64::LDNF1W_D_IMM: 2999 case 
AArch64::LDNF1SW_D_IMM:
    // A half vector's worth of data
    // Width = mbytes * elements
    Scale = TypeSize::Scalable(8);
    Width = SVEMaxBytesPerVector / 2;
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD1B_S_IMM:
  case AArch64::LD1SB_S_IMM:
  case AArch64::LD1H_D_IMM:
  case AArch64::LD1SH_D_IMM:
  case AArch64::ST1B_S_IMM:
  case AArch64::ST1H_D_IMM:
  case AArch64::LDNF1B_S_IMM:
  case AArch64::LDNF1SB_S_IMM:
  case AArch64::LDNF1H_D_IMM:
  case AArch64::LDNF1SH_D_IMM:
    // A quarter vector's worth of data
    // Width = mbytes * elements
    Scale = TypeSize::Scalable(4);
    Width = SVEMaxBytesPerVector / 4;
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::LD1B_D_IMM:
  case AArch64::LD1SB_D_IMM:
  case AArch64::ST1B_D_IMM:
  case AArch64::LDNF1B_D_IMM:
  case AArch64::LDNF1SB_D_IMM:
    // An eighth of a vector's worth of data
    // Width = mbytes * elements
    Scale = TypeSize::Scalable(2);
    Width = SVEMaxBytesPerVector / 8;
    MinOffset = -8;
    MaxOffset = 7;
    break;
  case AArch64::ST2GOffset:
  case AArch64::STZ2GOffset:
    Scale = TypeSize::Fixed(16);
    Width = 32;
    MinOffset = -256;
    MaxOffset = 255;
    break;
  case AArch64::STGPi:
    Scale = TypeSize::Fixed(16);
    Width = 16;
    MinOffset = -64;
    MaxOffset = 63;
    break;
  case AArch64::LD1RB_IMM:
  case AArch64::LD1RB_H_IMM:
  case AArch64::LD1RB_S_IMM:
  case AArch64::LD1RB_D_IMM:
  case AArch64::LD1RSB_H_IMM:
  case AArch64::LD1RSB_S_IMM:
  case AArch64::LD1RSB_D_IMM:
    Scale = TypeSize::Fixed(1);
    Width = 1;
    MinOffset = 0;
    MaxOffset = 63;
    break;
  case AArch64::LD1RH_IMM:
  case AArch64::LD1RH_S_IMM:
  case AArch64::LD1RH_D_IMM:
  case AArch64::LD1RSH_S_IMM:
  case AArch64::LD1RSH_D_IMM:
    Scale = TypeSize::Fixed(2);
    Width = 2;
    MinOffset = 0;
    MaxOffset = 63;
    break;
  case AArch64::LD1RW_IMM:
  case AArch64::LD1RW_D_IMM:
  case AArch64::LD1RSW_IMM:
    Scale = TypeSize::Fixed(4);
    Width = 4;
    MinOffset = 0;
    MaxOffset = 63;
    break;
  case AArch64::LD1RD_IMM:
    Scale = TypeSize::Fixed(8);
    Width = 8;
    MinOffset = 0;
    MaxOffset = 63;
    break;
  }

  return true;
}

// Scaling factor for unscaled load or store.
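// For instance (a worked example, not drawn from the code below): an unscaled
// STURXi at byte offset #24 has a memory scale of 8, so the equivalent
// element offset used by the scaled and paired forms is 24 / 8 = 3.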
3091 int AArch64InstrInfo::getMemScale(unsigned Opc) { 3092 switch (Opc) { 3093 default: 3094 llvm_unreachable("Opcode has unknown scale!"); 3095 case AArch64::LDRBBui: 3096 case AArch64::LDURBBi: 3097 case AArch64::LDRSBWui: 3098 case AArch64::LDURSBWi: 3099 case AArch64::STRBBui: 3100 case AArch64::STURBBi: 3101 return 1; 3102 case AArch64::LDRHHui: 3103 case AArch64::LDURHHi: 3104 case AArch64::LDRSHWui: 3105 case AArch64::LDURSHWi: 3106 case AArch64::STRHHui: 3107 case AArch64::STURHHi: 3108 return 2; 3109 case AArch64::LDRSui: 3110 case AArch64::LDURSi: 3111 case AArch64::LDRSpre: 3112 case AArch64::LDRSWui: 3113 case AArch64::LDURSWi: 3114 case AArch64::LDRWpre: 3115 case AArch64::LDRWui: 3116 case AArch64::LDURWi: 3117 case AArch64::STRSui: 3118 case AArch64::STURSi: 3119 case AArch64::STRSpre: 3120 case AArch64::STRWui: 3121 case AArch64::STURWi: 3122 case AArch64::STRWpre: 3123 case AArch64::LDPSi: 3124 case AArch64::LDPSWi: 3125 case AArch64::LDPWi: 3126 case AArch64::STPSi: 3127 case AArch64::STPWi: 3128 return 4; 3129 case AArch64::LDRDui: 3130 case AArch64::LDURDi: 3131 case AArch64::LDRDpre: 3132 case AArch64::LDRXui: 3133 case AArch64::LDURXi: 3134 case AArch64::LDRXpre: 3135 case AArch64::STRDui: 3136 case AArch64::STURDi: 3137 case AArch64::STRDpre: 3138 case AArch64::STRXui: 3139 case AArch64::STURXi: 3140 case AArch64::STRXpre: 3141 case AArch64::LDPDi: 3142 case AArch64::LDPXi: 3143 case AArch64::STPDi: 3144 case AArch64::STPXi: 3145 return 8; 3146 case AArch64::LDRQui: 3147 case AArch64::LDURQi: 3148 case AArch64::STRQui: 3149 case AArch64::STURQi: 3150 case AArch64::STRQpre: 3151 case AArch64::LDPQi: 3152 case AArch64::LDRQpre: 3153 case AArch64::STPQi: 3154 case AArch64::STGOffset: 3155 case AArch64::STZGOffset: 3156 case AArch64::ST2GOffset: 3157 case AArch64::STZ2GOffset: 3158 case AArch64::STGPi: 3159 return 16; 3160 } 3161 } 3162 3163 bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) { 3164 switch (MI.getOpcode()) { 3165 default: 3166 return false; 3167 case AArch64::LDRWpre: 3168 case AArch64::LDRXpre: 3169 case AArch64::LDRSpre: 3170 case AArch64::LDRDpre: 3171 case AArch64::LDRQpre: 3172 return true; 3173 } 3174 } 3175 3176 bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) { 3177 switch (MI.getOpcode()) { 3178 default: 3179 return false; 3180 case AArch64::STRWpre: 3181 case AArch64::STRXpre: 3182 case AArch64::STRSpre: 3183 case AArch64::STRDpre: 3184 case AArch64::STRQpre: 3185 return true; 3186 } 3187 } 3188 3189 bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) { 3190 return isPreLd(MI) || isPreSt(MI); 3191 } 3192 3193 bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) { 3194 switch (MI.getOpcode()) { 3195 default: 3196 return false; 3197 case AArch64::LDPSi: 3198 case AArch64::LDPSWi: 3199 case AArch64::LDPDi: 3200 case AArch64::LDPQi: 3201 case AArch64::LDPWi: 3202 case AArch64::LDPXi: 3203 case AArch64::STPSi: 3204 case AArch64::STPDi: 3205 case AArch64::STPQi: 3206 case AArch64::STPWi: 3207 case AArch64::STPXi: 3208 case AArch64::STGPi: 3209 return true; 3210 } 3211 } 3212 3213 const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) { 3214 unsigned Idx = 3215 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2 3216 : 1; 3217 return MI.getOperand(Idx); 3218 } 3219 3220 const MachineOperand & 3221 AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) { 3222 unsigned Idx = 3223 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 
3 3224 : 2; 3225 return MI.getOperand(Idx); 3226 } 3227 3228 static const TargetRegisterClass *getRegClass(const MachineInstr &MI, 3229 Register Reg) { 3230 if (MI.getParent() == nullptr) 3231 return nullptr; 3232 const MachineFunction *MF = MI.getParent()->getParent(); 3233 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr; 3234 } 3235 3236 bool AArch64InstrInfo::isQForm(const MachineInstr &MI) { 3237 auto IsQFPR = [&](const MachineOperand &Op) { 3238 if (!Op.isReg()) 3239 return false; 3240 auto Reg = Op.getReg(); 3241 if (Reg.isPhysical()) 3242 return AArch64::FPR128RegClass.contains(Reg); 3243 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); 3244 return TRC == &AArch64::FPR128RegClass || 3245 TRC == &AArch64::FPR128_loRegClass; 3246 }; 3247 return llvm::any_of(MI.operands(), IsQFPR); 3248 } 3249 3250 bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) { 3251 auto IsFPR = [&](const MachineOperand &Op) { 3252 if (!Op.isReg()) 3253 return false; 3254 auto Reg = Op.getReg(); 3255 if (Reg.isPhysical()) 3256 return AArch64::FPR128RegClass.contains(Reg) || 3257 AArch64::FPR64RegClass.contains(Reg) || 3258 AArch64::FPR32RegClass.contains(Reg) || 3259 AArch64::FPR16RegClass.contains(Reg) || 3260 AArch64::FPR8RegClass.contains(Reg); 3261 3262 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg); 3263 return TRC == &AArch64::FPR128RegClass || 3264 TRC == &AArch64::FPR128_loRegClass || 3265 TRC == &AArch64::FPR64RegClass || 3266 TRC == &AArch64::FPR64_loRegClass || 3267 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass || 3268 TRC == &AArch64::FPR8RegClass; 3269 }; 3270 return llvm::any_of(MI.operands(), IsFPR); 3271 } 3272 3273 // Scale the unscaled offsets. Returns false if the unscaled offset can't be 3274 // scaled. 3275 static bool scaleOffset(unsigned Opc, int64_t &Offset) { 3276 int Scale = AArch64InstrInfo::getMemScale(Opc); 3277 3278 // If the byte-offset isn't a multiple of the stride, we can't scale this 3279 // offset. 3280 if (Offset % Scale != 0) 3281 return false; 3282 3283 // Convert the byte-offset used by unscaled into an "element" offset used 3284 // by the scaled pair load/store instructions. 3285 Offset /= Scale; 3286 return true; 3287 } 3288 3289 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) { 3290 if (FirstOpc == SecondOpc) 3291 return true; 3292 // We can also pair sign-ext and zero-ext instructions. 3293 switch (FirstOpc) { 3294 default: 3295 return false; 3296 case AArch64::LDRWui: 3297 case AArch64::LDURWi: 3298 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi; 3299 case AArch64::LDRSWui: 3300 case AArch64::LDURSWi: 3301 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi; 3302 } 3303 // These instructions can't be paired based on their opcodes. 3304 return false; 3305 } 3306 3307 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, 3308 int64_t Offset1, unsigned Opcode1, int FI2, 3309 int64_t Offset2, unsigned Opcode2) { 3310 // Accesses through fixed stack object frame indices may access a different 3311 // fixed stack slot. Check that the object offsets + offsets match. 3312 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) { 3313 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1); 3314 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2); 3315 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered."); 3316 // Convert to scaled object offsets. 
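    // Worked example with hypothetical numbers: two 8-byte fixed objects at
    // object offsets -16 and -8, both accessed with instruction offset 0,
    // scale to -2 and -1; since -2 + 0 + 1 == -1 + 0, they are clustered.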
3317 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1); 3318 if (ObjectOffset1 % Scale1 != 0) 3319 return false; 3320 ObjectOffset1 /= Scale1; 3321 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2); 3322 if (ObjectOffset2 % Scale2 != 0) 3323 return false; 3324 ObjectOffset2 /= Scale2; 3325 ObjectOffset1 += Offset1; 3326 ObjectOffset2 += Offset2; 3327 return ObjectOffset1 + 1 == ObjectOffset2; 3328 } 3329 3330 return FI1 == FI2; 3331 } 3332 3333 /// Detect opportunities for ldp/stp formation. 3334 /// 3335 /// Only called for LdSt for which getMemOperandWithOffset returns true. 3336 bool AArch64InstrInfo::shouldClusterMemOps( 3337 ArrayRef<const MachineOperand *> BaseOps1, 3338 ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads, 3339 unsigned NumBytes) const { 3340 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1); 3341 const MachineOperand &BaseOp1 = *BaseOps1.front(); 3342 const MachineOperand &BaseOp2 = *BaseOps2.front(); 3343 const MachineInstr &FirstLdSt = *BaseOp1.getParent(); 3344 const MachineInstr &SecondLdSt = *BaseOp2.getParent(); 3345 if (BaseOp1.getType() != BaseOp2.getType()) 3346 return false; 3347 3348 assert((BaseOp1.isReg() || BaseOp1.isFI()) && 3349 "Only base registers and frame indices are supported."); 3350 3351 // Check for both base regs and base FI. 3352 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg()) 3353 return false; 3354 3355 // Only cluster up to a single pair. 3356 if (NumLoads > 2) 3357 return false; 3358 3359 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt)) 3360 return false; 3361 3362 // Can we pair these instructions based on their opcodes? 3363 unsigned FirstOpc = FirstLdSt.getOpcode(); 3364 unsigned SecondOpc = SecondLdSt.getOpcode(); 3365 if (!canPairLdStOpc(FirstOpc, SecondOpc)) 3366 return false; 3367 3368 // Can't merge volatiles or load/stores that have a hint to avoid pair 3369 // formation, for example. 3370 if (!isCandidateToMergeOrPair(FirstLdSt) || 3371 !isCandidateToMergeOrPair(SecondLdSt)) 3372 return false; 3373 3374 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate. 3375 int64_t Offset1 = FirstLdSt.getOperand(2).getImm(); 3376 if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1)) 3377 return false; 3378 3379 int64_t Offset2 = SecondLdSt.getOperand(2).getImm(); 3380 if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2)) 3381 return false; 3382 3383 // Pairwise instructions have a 7-bit signed offset field. 3384 if (Offset1 > 63 || Offset1 < -64) 3385 return false; 3386 3387 // The caller should already have ordered First/SecondLdSt by offset. 
3388 // Note: except for non-equal frame index bases 3389 if (BaseOp1.isFI()) { 3390 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) && 3391 "Caller should have ordered offsets."); 3392 3393 const MachineFrameInfo &MFI = 3394 FirstLdSt.getParent()->getParent()->getFrameInfo(); 3395 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc, 3396 BaseOp2.getIndex(), Offset2, SecondOpc); 3397 } 3398 3399 assert(Offset1 <= Offset2 && "Caller should have ordered offsets."); 3400 3401 return Offset1 + 1 == Offset2; 3402 } 3403 3404 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, 3405 unsigned Reg, unsigned SubIdx, 3406 unsigned State, 3407 const TargetRegisterInfo *TRI) { 3408 if (!SubIdx) 3409 return MIB.addReg(Reg, State); 3410 3411 if (Register::isPhysicalRegister(Reg)) 3412 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); 3413 return MIB.addReg(Reg, State, SubIdx); 3414 } 3415 3416 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, 3417 unsigned NumRegs) { 3418 // We really want the positive remainder mod 32 here, that happens to be 3419 // easily obtainable with a mask. 3420 return ((DestReg - SrcReg) & 0x1f) < NumRegs; 3421 } 3422 3423 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB, 3424 MachineBasicBlock::iterator I, 3425 const DebugLoc &DL, MCRegister DestReg, 3426 MCRegister SrcReg, bool KillSrc, 3427 unsigned Opcode, 3428 ArrayRef<unsigned> Indices) const { 3429 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON"); 3430 const TargetRegisterInfo *TRI = &getRegisterInfo(); 3431 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 3432 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 3433 unsigned NumRegs = Indices.size(); 3434 3435 int SubReg = 0, End = NumRegs, Incr = 1; 3436 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) { 3437 SubReg = NumRegs - 1; 3438 End = -1; 3439 Incr = -1; 3440 } 3441 3442 for (; SubReg != End; SubReg += Incr) { 3443 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 3444 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 3445 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); 3446 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 3447 } 3448 } 3449 3450 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB, 3451 MachineBasicBlock::iterator I, 3452 DebugLoc DL, unsigned DestReg, 3453 unsigned SrcReg, bool KillSrc, 3454 unsigned Opcode, unsigned ZeroReg, 3455 llvm::ArrayRef<unsigned> Indices) const { 3456 const TargetRegisterInfo *TRI = &getRegisterInfo(); 3457 unsigned NumRegs = Indices.size(); 3458 3459 #ifndef NDEBUG 3460 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 3461 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 3462 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 && 3463 "GPR reg sequences should not be able to overlap"); 3464 #endif 3465 3466 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) { 3467 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 3468 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 3469 MIB.addReg(ZeroReg); 3470 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 3471 MIB.addImm(0); 3472 } 3473 } 3474 3475 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, 3476 MachineBasicBlock::iterator I, 3477 const DebugLoc &DL, MCRegister DestReg, 3478 MCRegister SrcReg, bool KillSrc) const { 3479 if (AArch64::GPR32spRegClass.contains(DestReg) && 
3480 (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { 3481 const TargetRegisterInfo *TRI = &getRegisterInfo(); 3482 3483 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) { 3484 // If either operand is WSP, expand to ADD #0. 3485 if (Subtarget.hasZeroCycleRegMove()) { 3486 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. 3487 MCRegister DestRegX = TRI->getMatchingSuperReg( 3488 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 3489 MCRegister SrcRegX = TRI->getMatchingSuperReg( 3490 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 3491 // This instruction is reading and writing X registers. This may upset 3492 // the register scavenger and machine verifier, so we need to indicate 3493 // that we are reading an undefined value from SrcRegX, but a proper 3494 // value from SrcReg. 3495 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX) 3496 .addReg(SrcRegX, RegState::Undef) 3497 .addImm(0) 3498 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) 3499 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 3500 } else { 3501 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg) 3502 .addReg(SrcReg, getKillRegState(KillSrc)) 3503 .addImm(0) 3504 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3505 } 3506 } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) { 3507 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg) 3508 .addImm(0) 3509 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3510 } else { 3511 if (Subtarget.hasZeroCycleRegMove()) { 3512 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. 3513 MCRegister DestRegX = TRI->getMatchingSuperReg( 3514 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 3515 MCRegister SrcRegX = TRI->getMatchingSuperReg( 3516 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 3517 // This instruction is reading and writing X registers. This may upset 3518 // the register scavenger and machine verifier, so we need to indicate 3519 // that we are reading an undefined value from SrcRegX, but a proper 3520 // value from SrcReg. 3521 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX) 3522 .addReg(AArch64::XZR) 3523 .addReg(SrcRegX, RegState::Undef) 3524 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 3525 } else { 3526 // Otherwise, expand to ORR WZR. 3527 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg) 3528 .addReg(AArch64::WZR) 3529 .addReg(SrcReg, getKillRegState(KillSrc)); 3530 } 3531 } 3532 return; 3533 } 3534 3535 // Copy a Predicate register by ORRing with itself. 3536 if (AArch64::PPRRegClass.contains(DestReg) && 3537 AArch64::PPRRegClass.contains(SrcReg)) { 3538 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 3539 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg) 3540 .addReg(SrcReg) // Pg 3541 .addReg(SrcReg) 3542 .addReg(SrcReg, getKillRegState(KillSrc)); 3543 return; 3544 } 3545 3546 // Copy a Z register by ORRing with itself. 3547 if (AArch64::ZPRRegClass.contains(DestReg) && 3548 AArch64::ZPRRegClass.contains(SrcReg)) { 3549 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 3550 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg) 3551 .addReg(SrcReg) 3552 .addReg(SrcReg, getKillRegState(KillSrc)); 3553 return; 3554 } 3555 3556 // Copy a Z register pair by copying the individual sub-registers. 
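  // For illustration, copying z4_z5 into z0_z1 ends up as two unpredicated
  // ORRs, one per sub-register:
  //   orr z0.d, z4.d, z4.d
  //   orr z1.d, z5.d, z5.d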
3557 if (AArch64::ZPR2RegClass.contains(DestReg) && 3558 AArch64::ZPR2RegClass.contains(SrcReg)) { 3559 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 3560 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1}; 3561 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 3562 Indices); 3563 return; 3564 } 3565 3566 // Copy a Z register triple by copying the individual sub-registers. 3567 if (AArch64::ZPR3RegClass.contains(DestReg) && 3568 AArch64::ZPR3RegClass.contains(SrcReg)) { 3569 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 3570 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, 3571 AArch64::zsub2}; 3572 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 3573 Indices); 3574 return; 3575 } 3576 3577 // Copy a Z register quad by copying the individual sub-registers. 3578 if (AArch64::ZPR4RegClass.contains(DestReg) && 3579 AArch64::ZPR4RegClass.contains(SrcReg)) { 3580 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register."); 3581 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, 3582 AArch64::zsub2, AArch64::zsub3}; 3583 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 3584 Indices); 3585 return; 3586 } 3587 3588 if (AArch64::GPR64spRegClass.contains(DestReg) && 3589 (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { 3590 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { 3591 // If either operand is SP, expand to ADD #0. 3592 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg) 3593 .addReg(SrcReg, getKillRegState(KillSrc)) 3594 .addImm(0) 3595 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3596 } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) { 3597 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg) 3598 .addImm(0) 3599 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3600 } else { 3601 // Otherwise, expand to ORR XZR. 3602 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg) 3603 .addReg(AArch64::XZR) 3604 .addReg(SrcReg, getKillRegState(KillSrc)); 3605 } 3606 return; 3607 } 3608 3609 // Copy a DDDD register quad by copying the individual sub-registers. 3610 if (AArch64::DDDDRegClass.contains(DestReg) && 3611 AArch64::DDDDRegClass.contains(SrcReg)) { 3612 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 3613 AArch64::dsub2, AArch64::dsub3}; 3614 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 3615 Indices); 3616 return; 3617 } 3618 3619 // Copy a DDD register triple by copying the individual sub-registers. 3620 if (AArch64::DDDRegClass.contains(DestReg) && 3621 AArch64::DDDRegClass.contains(SrcReg)) { 3622 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 3623 AArch64::dsub2}; 3624 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 3625 Indices); 3626 return; 3627 } 3628 3629 // Copy a DD register pair by copying the individual sub-registers. 3630 if (AArch64::DDRegClass.contains(DestReg) && 3631 AArch64::DDRegClass.contains(SrcReg)) { 3632 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1}; 3633 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 3634 Indices); 3635 return; 3636 } 3637 3638 // Copy a QQQQ register quad by copying the individual sub-registers. 
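// Note that copyPhysRegTuple (see forwardCopyWillClobberTuple above) reverses
// the sub-register order when the destination tuple overlaps a later part of
// the source. As a hypothetical example, copying q0_q1_q2_q3 into q2_q3_q4_q5
// in forward order would clobber q2/q3 before they are read, so the copy is
// emitted starting from the last sub-register instead.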
3639 if (AArch64::QQQQRegClass.contains(DestReg) && 3640 AArch64::QQQQRegClass.contains(SrcReg)) { 3641 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 3642 AArch64::qsub2, AArch64::qsub3}; 3643 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 3644 Indices); 3645 return; 3646 } 3647 3648 // Copy a QQQ register triple by copying the individual sub-registers. 3649 if (AArch64::QQQRegClass.contains(DestReg) && 3650 AArch64::QQQRegClass.contains(SrcReg)) { 3651 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 3652 AArch64::qsub2}; 3653 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 3654 Indices); 3655 return; 3656 } 3657 3658 // Copy a QQ register pair by copying the individual sub-registers. 3659 if (AArch64::QQRegClass.contains(DestReg) && 3660 AArch64::QQRegClass.contains(SrcReg)) { 3661 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1}; 3662 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 3663 Indices); 3664 return; 3665 } 3666 3667 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) && 3668 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) { 3669 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64}; 3670 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs, 3671 AArch64::XZR, Indices); 3672 return; 3673 } 3674 3675 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) && 3676 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) { 3677 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32}; 3678 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs, 3679 AArch64::WZR, Indices); 3680 return; 3681 } 3682 3683 if (AArch64::FPR128RegClass.contains(DestReg) && 3684 AArch64::FPR128RegClass.contains(SrcReg)) { 3685 if (Subtarget.forceStreamingCompatibleSVE()) { 3686 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ)) 3687 .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define) 3688 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0)) 3689 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0)); 3690 } else if (Subtarget.hasNEON()) { 3691 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 3692 .addReg(SrcReg) 3693 .addReg(SrcReg, getKillRegState(KillSrc)); 3694 } else { 3695 BuildMI(MBB, I, DL, get(AArch64::STRQpre)) 3696 .addReg(AArch64::SP, RegState::Define) 3697 .addReg(SrcReg, getKillRegState(KillSrc)) 3698 .addReg(AArch64::SP) 3699 .addImm(-16); 3700 BuildMI(MBB, I, DL, get(AArch64::LDRQpre)) 3701 .addReg(AArch64::SP, RegState::Define) 3702 .addReg(DestReg, RegState::Define) 3703 .addReg(AArch64::SP) 3704 .addImm(16); 3705 } 3706 return; 3707 } 3708 3709 if (AArch64::FPR64RegClass.contains(DestReg) && 3710 AArch64::FPR64RegClass.contains(SrcReg)) { 3711 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg) 3712 .addReg(SrcReg, getKillRegState(KillSrc)); 3713 return; 3714 } 3715 3716 if (AArch64::FPR32RegClass.contains(DestReg) && 3717 AArch64::FPR32RegClass.contains(SrcReg)) { 3718 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 3719 .addReg(SrcReg, getKillRegState(KillSrc)); 3720 return; 3721 } 3722 3723 if (AArch64::FPR16RegClass.contains(DestReg) && 3724 AArch64::FPR16RegClass.contains(SrcReg)) { 3725 DestReg = 3726 RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass); 3727 SrcReg = 3728 RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass); 3729 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 3730 .addReg(SrcReg, getKillRegState(KillSrc)); 3731 return; 3732 } 3733 3734 if 
(AArch64::FPR8RegClass.contains(DestReg) && 3735 AArch64::FPR8RegClass.contains(SrcReg)) { 3736 DestReg = 3737 RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass); 3738 SrcReg = 3739 RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass); 3740 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 3741 .addReg(SrcReg, getKillRegState(KillSrc)); 3742 return; 3743 } 3744 3745 // Copies between GPR64 and FPR64. 3746 if (AArch64::FPR64RegClass.contains(DestReg) && 3747 AArch64::GPR64RegClass.contains(SrcReg)) { 3748 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) 3749 .addReg(SrcReg, getKillRegState(KillSrc)); 3750 return; 3751 } 3752 if (AArch64::GPR64RegClass.contains(DestReg) && 3753 AArch64::FPR64RegClass.contains(SrcReg)) { 3754 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg) 3755 .addReg(SrcReg, getKillRegState(KillSrc)); 3756 return; 3757 } 3758 // Copies between GPR32 and FPR32. 3759 if (AArch64::FPR32RegClass.contains(DestReg) && 3760 AArch64::GPR32RegClass.contains(SrcReg)) { 3761 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) 3762 .addReg(SrcReg, getKillRegState(KillSrc)); 3763 return; 3764 } 3765 if (AArch64::GPR32RegClass.contains(DestReg) && 3766 AArch64::FPR32RegClass.contains(SrcReg)) { 3767 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg) 3768 .addReg(SrcReg, getKillRegState(KillSrc)); 3769 return; 3770 } 3771 3772 if (DestReg == AArch64::NZCV) { 3773 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy"); 3774 BuildMI(MBB, I, DL, get(AArch64::MSR)) 3775 .addImm(AArch64SysReg::NZCV) 3776 .addReg(SrcReg, getKillRegState(KillSrc)) 3777 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define); 3778 return; 3779 } 3780 3781 if (SrcReg == AArch64::NZCV) { 3782 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy"); 3783 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg) 3784 .addImm(AArch64SysReg::NZCV) 3785 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); 3786 return; 3787 } 3788 3789 #ifndef NDEBUG 3790 const TargetRegisterInfo &TRI = getRegisterInfo(); 3791 errs() << TRI.getRegAsmName(DestReg) << " = COPY " 3792 << TRI.getRegAsmName(SrcReg) << "\n"; 3793 #endif 3794 llvm_unreachable("unimplemented reg-to-reg copy"); 3795 } 3796 3797 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, 3798 MachineBasicBlock &MBB, 3799 MachineBasicBlock::iterator InsertBefore, 3800 const MCInstrDesc &MCID, 3801 Register SrcReg, bool IsKill, 3802 unsigned SubIdx0, unsigned SubIdx1, int FI, 3803 MachineMemOperand *MMO) { 3804 Register SrcReg0 = SrcReg; 3805 Register SrcReg1 = SrcReg; 3806 if (SrcReg.isPhysical()) { 3807 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0); 3808 SubIdx0 = 0; 3809 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1); 3810 SubIdx1 = 0; 3811 } 3812 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 3813 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0) 3814 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1) 3815 .addFrameIndex(FI) 3816 .addImm(0) 3817 .addMemOperand(MMO); 3818 } 3819 3820 void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, 3821 MachineBasicBlock::iterator MBBI, 3822 Register SrcReg, bool isKill, int FI, 3823 const TargetRegisterClass *RC, 3824 const TargetRegisterInfo *TRI, 3825 Register VReg) const { 3826 MachineFunction &MF = *MBB.getParent(); 3827 MachineFrameInfo &MFI = MF.getFrameInfo(); 3828 3829 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 3830 MachineMemOperand *MMO = 3831 MF.getMachineMemOperand(PtrInfo, 
MachineMemOperand::MOStore, 3832 MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); 3833 unsigned Opc = 0; 3834 bool Offset = true; 3835 unsigned StackID = TargetStackID::Default; 3836 switch (TRI->getSpillSize(*RC)) { 3837 case 1: 3838 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 3839 Opc = AArch64::STRBui; 3840 break; 3841 case 2: 3842 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 3843 Opc = AArch64::STRHui; 3844 else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { 3845 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3846 Opc = AArch64::STR_PXI; 3847 StackID = TargetStackID::ScalableVector; 3848 } 3849 break; 3850 case 4: 3851 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 3852 Opc = AArch64::STRWui; 3853 if (SrcReg.isVirtual()) 3854 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass); 3855 else 3856 assert(SrcReg != AArch64::WSP); 3857 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 3858 Opc = AArch64::STRSui; 3859 break; 3860 case 8: 3861 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 3862 Opc = AArch64::STRXui; 3863 if (SrcReg.isVirtual()) 3864 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 3865 else 3866 assert(SrcReg != AArch64::SP); 3867 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 3868 Opc = AArch64::STRDui; 3869 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 3870 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 3871 get(AArch64::STPWi), SrcReg, isKill, 3872 AArch64::sube32, AArch64::subo32, FI, MMO); 3873 return; 3874 } 3875 break; 3876 case 16: 3877 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 3878 Opc = AArch64::STRQui; 3879 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 3880 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3881 Opc = AArch64::ST1Twov1d; 3882 Offset = false; 3883 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 3884 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 3885 get(AArch64::STPXi), SrcReg, isKill, 3886 AArch64::sube64, AArch64::subo64, FI, MMO); 3887 return; 3888 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 3889 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3890 Opc = AArch64::STR_ZXI; 3891 StackID = TargetStackID::ScalableVector; 3892 } 3893 break; 3894 case 24: 3895 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 3896 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3897 Opc = AArch64::ST1Threev1d; 3898 Offset = false; 3899 } 3900 break; 3901 case 32: 3902 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 3903 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3904 Opc = AArch64::ST1Fourv1d; 3905 Offset = false; 3906 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 3907 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3908 Opc = AArch64::ST1Twov2d; 3909 Offset = false; 3910 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { 3911 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3912 Opc = AArch64::STR_ZZXI; 3913 StackID = TargetStackID::ScalableVector; 3914 } 3915 break; 3916 case 48: 3917 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 3918 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3919 Opc = AArch64::ST1Threev2d; 3920 Offset = false; 3921 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { 3922 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3923 Opc = AArch64::STR_ZZZXI; 3924 StackID = TargetStackID::ScalableVector; 3925 } 3926 
break; 3927 case 64: 3928 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 3929 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3930 Opc = AArch64::ST1Fourv2d; 3931 Offset = false; 3932 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { 3933 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3934 Opc = AArch64::STR_ZZZZXI; 3935 StackID = TargetStackID::ScalableVector; 3936 } 3937 break; 3938 } 3939 assert(Opc && "Unknown register class"); 3940 MFI.setStackID(FI, StackID); 3941 3942 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 3943 .addReg(SrcReg, getKillRegState(isKill)) 3944 .addFrameIndex(FI); 3945 3946 if (Offset) 3947 MI.addImm(0); 3948 MI.addMemOperand(MMO); 3949 } 3950 3951 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, 3952 MachineBasicBlock &MBB, 3953 MachineBasicBlock::iterator InsertBefore, 3954 const MCInstrDesc &MCID, 3955 Register DestReg, unsigned SubIdx0, 3956 unsigned SubIdx1, int FI, 3957 MachineMemOperand *MMO) { 3958 Register DestReg0 = DestReg; 3959 Register DestReg1 = DestReg; 3960 bool IsUndef = true; 3961 if (DestReg.isPhysical()) { 3962 DestReg0 = TRI.getSubReg(DestReg, SubIdx0); 3963 SubIdx0 = 0; 3964 DestReg1 = TRI.getSubReg(DestReg, SubIdx1); 3965 SubIdx1 = 0; 3966 IsUndef = false; 3967 } 3968 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 3969 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0) 3970 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1) 3971 .addFrameIndex(FI) 3972 .addImm(0) 3973 .addMemOperand(MMO); 3974 } 3975 3976 void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, 3977 MachineBasicBlock::iterator MBBI, 3978 Register DestReg, int FI, 3979 const TargetRegisterClass *RC, 3980 const TargetRegisterInfo *TRI, 3981 Register VReg) const { 3982 MachineFunction &MF = *MBB.getParent(); 3983 MachineFrameInfo &MFI = MF.getFrameInfo(); 3984 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 3985 MachineMemOperand *MMO = 3986 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, 3987 MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); 3988 3989 unsigned Opc = 0; 3990 bool Offset = true; 3991 unsigned StackID = TargetStackID::Default; 3992 switch (TRI->getSpillSize(*RC)) { 3993 case 1: 3994 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 3995 Opc = AArch64::LDRBui; 3996 break; 3997 case 2: 3998 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 3999 Opc = AArch64::LDRHui; 4000 else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { 4001 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 4002 Opc = AArch64::LDR_PXI; 4003 StackID = TargetStackID::ScalableVector; 4004 } 4005 break; 4006 case 4: 4007 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 4008 Opc = AArch64::LDRWui; 4009 if (DestReg.isVirtual()) 4010 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass); 4011 else 4012 assert(DestReg != AArch64::WSP); 4013 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 4014 Opc = AArch64::LDRSui; 4015 break; 4016 case 8: 4017 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 4018 Opc = AArch64::LDRXui; 4019 if (DestReg.isVirtual()) 4020 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass); 4021 else 4022 assert(DestReg != AArch64::SP); 4023 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 4024 Opc = AArch64::LDRDui; 4025 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 4026 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 4027 
get(AArch64::LDPWi), DestReg, AArch64::sube32, 4028 AArch64::subo32, FI, MMO); 4029 return; 4030 } 4031 break; 4032 case 16: 4033 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 4034 Opc = AArch64::LDRQui; 4035 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 4036 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 4037 Opc = AArch64::LD1Twov1d; 4038 Offset = false; 4039 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 4040 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 4041 get(AArch64::LDPXi), DestReg, AArch64::sube64, 4042 AArch64::subo64, FI, MMO); 4043 return; 4044 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 4045 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 4046 Opc = AArch64::LDR_ZXI; 4047 StackID = TargetStackID::ScalableVector; 4048 } 4049 break; 4050 case 24: 4051 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 4052 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 4053 Opc = AArch64::LD1Threev1d; 4054 Offset = false; 4055 } 4056 break; 4057 case 32: 4058 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 4059 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 4060 Opc = AArch64::LD1Fourv1d; 4061 Offset = false; 4062 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 4063 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 4064 Opc = AArch64::LD1Twov2d; 4065 Offset = false; 4066 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { 4067 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 4068 Opc = AArch64::LDR_ZZXI; 4069 StackID = TargetStackID::ScalableVector; 4070 } 4071 break; 4072 case 48: 4073 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 4074 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 4075 Opc = AArch64::LD1Threev2d; 4076 Offset = false; 4077 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { 4078 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 4079 Opc = AArch64::LDR_ZZZXI; 4080 StackID = TargetStackID::ScalableVector; 4081 } 4082 break; 4083 case 64: 4084 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 4085 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 4086 Opc = AArch64::LD1Fourv2d; 4087 Offset = false; 4088 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { 4089 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 4090 Opc = AArch64::LDR_ZZZZXI; 4091 StackID = TargetStackID::ScalableVector; 4092 } 4093 break; 4094 } 4095 4096 assert(Opc && "Unknown register class"); 4097 MFI.setStackID(FI, StackID); 4098 4099 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 4100 .addReg(DestReg, getDefRegState(true)) 4101 .addFrameIndex(FI); 4102 if (Offset) 4103 MI.addImm(0); 4104 MI.addMemOperand(MMO); 4105 } 4106 4107 bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, 4108 const MachineInstr &UseMI, 4109 const TargetRegisterInfo *TRI) { 4110 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()), 4111 UseMI.getIterator()), 4112 [TRI](const MachineInstr &I) { 4113 return I.modifiesRegister(AArch64::NZCV, TRI) || 4114 I.readsRegister(AArch64::NZCV, TRI); 4115 }); 4116 } 4117 4118 void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets( 4119 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) { 4120 // The smallest scalable element supported by scaled SVE addressing 4121 // modes are predicates, which are 2 scalable bytes in size. 
So the scalable 4122 // byte offset must always be a multiple of 2. 4123 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset"); 4124 4125 // VGSized offsets are divided by '2', because the VG register is the 4126 // number of 64bit granules as opposed to 128bit vector chunks, 4127 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled. 4128 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes. 4129 // VG = n * 2 and the dwarf offset must be VG * 8 bytes. 4130 ByteSized = Offset.getFixed(); 4131 VGSized = Offset.getScalable() / 2; 4132 } 4133 4134 /// Returns the offset in parts to which this frame offset can be 4135 /// decomposed for the purpose of describing a frame offset. 4136 /// For non-scalable offsets this is simply its byte size. 4137 void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( 4138 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors, 4139 int64_t &NumDataVectors) { 4140 // The smallest scalable element supported by scaled SVE addressing 4141 // modes are predicates, which are 2 scalable bytes in size. So the scalable 4142 // byte offset must always be a multiple of 2. 4143 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset"); 4144 4145 NumBytes = Offset.getFixed(); 4146 NumDataVectors = 0; 4147 NumPredicateVectors = Offset.getScalable() / 2; 4148 // This method is used to get the offsets to adjust the frame offset. 4149 // If the function requires ADDPL to be used and needs more than two ADDPL 4150 // instructions, part of the offset is folded into NumDataVectors so that it 4151 // uses ADDVL for part of it, reducing the number of ADDPL instructions. 4152 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 || 4153 NumPredicateVectors > 62) { 4154 NumDataVectors = NumPredicateVectors / 8; 4155 NumPredicateVectors -= NumDataVectors * 8; 4156 } 4157 } 4158 4159 // Convenience function to create a DWARF expression for 4160 // Expr + NumBytes + NumVGScaledBytes * AArch64::VG 4161 static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes, 4162 int NumVGScaledBytes, unsigned VG, 4163 llvm::raw_string_ostream &Comment) { 4164 uint8_t buffer[16]; 4165 4166 if (NumBytes) { 4167 Expr.push_back(dwarf::DW_OP_consts); 4168 Expr.append(buffer, buffer + encodeSLEB128(NumBytes, buffer)); 4169 Expr.push_back((uint8_t)dwarf::DW_OP_plus); 4170 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(NumBytes); 4171 } 4172 4173 if (NumVGScaledBytes) { 4174 Expr.push_back((uint8_t)dwarf::DW_OP_consts); 4175 Expr.append(buffer, buffer + encodeSLEB128(NumVGScaledBytes, buffer)); 4176 4177 Expr.push_back((uint8_t)dwarf::DW_OP_bregx); 4178 Expr.append(buffer, buffer + encodeULEB128(VG, buffer)); 4179 Expr.push_back(0); 4180 4181 Expr.push_back((uint8_t)dwarf::DW_OP_mul); 4182 Expr.push_back((uint8_t)dwarf::DW_OP_plus); 4183 4184 Comment << (NumVGScaledBytes < 0 ?
" - " : " + ") 4185 << std::abs(NumVGScaledBytes) << " * VG"; 4186 } 4187 } 4188 4189 // Creates an MCCFIInstruction: 4190 // { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr } 4191 static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, 4192 unsigned Reg, 4193 const StackOffset &Offset) { 4194 int64_t NumBytes, NumVGScaledBytes; 4195 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, NumBytes, 4196 NumVGScaledBytes); 4197 std::string CommentBuffer; 4198 llvm::raw_string_ostream Comment(CommentBuffer); 4199 4200 if (Reg == AArch64::SP) 4201 Comment << "sp"; 4202 else if (Reg == AArch64::FP) 4203 Comment << "fp"; 4204 else 4205 Comment << printReg(Reg, &TRI); 4206 4207 // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG) 4208 SmallString<64> Expr; 4209 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); 4210 Expr.push_back((uint8_t)(dwarf::DW_OP_breg0 + DwarfReg)); 4211 Expr.push_back(0); 4212 appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes, 4213 TRI.getDwarfRegNum(AArch64::VG, true), Comment); 4214 4215 // Wrap this into DW_CFA_def_cfa. 4216 SmallString<64> DefCfaExpr; 4217 DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression); 4218 uint8_t buffer[16]; 4219 DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer)); 4220 DefCfaExpr.append(Expr.str()); 4221 return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), 4222 Comment.str()); 4223 } 4224 4225 MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI, 4226 unsigned FrameReg, unsigned Reg, 4227 const StackOffset &Offset, 4228 bool LastAdjustmentWasScalable) { 4229 if (Offset.getScalable()) 4230 return createDefCFAExpression(TRI, Reg, Offset); 4231 4232 if (FrameReg == Reg && !LastAdjustmentWasScalable) 4233 return MCCFIInstruction::cfiDefCfaOffset(nullptr, int(Offset.getFixed())); 4234 4235 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); 4236 return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed()); 4237 } 4238 4239 MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI, 4240 unsigned Reg, 4241 const StackOffset &OffsetFromDefCFA) { 4242 int64_t NumBytes, NumVGScaledBytes; 4243 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets( 4244 OffsetFromDefCFA, NumBytes, NumVGScaledBytes); 4245 4246 unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); 4247 4248 // Non-scalable offsets can use DW_CFA_offset directly. 4249 if (!NumVGScaledBytes) 4250 return MCCFIInstruction::createOffset(nullptr, DwarfReg, NumBytes); 4251 4252 std::string CommentBuffer; 4253 llvm::raw_string_ostream Comment(CommentBuffer); 4254 Comment << printReg(Reg, &TRI) << " @ cfa"; 4255 4256 // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG) 4257 SmallString<64> OffsetExpr; 4258 appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes, 4259 TRI.getDwarfRegNum(AArch64::VG, true), Comment); 4260 4261 // Wrap this into DW_CFA_expression 4262 SmallString<64> CfaExpr; 4263 CfaExpr.push_back(dwarf::DW_CFA_expression); 4264 uint8_t buffer[16]; 4265 CfaExpr.append(buffer, buffer + encodeULEB128(DwarfReg, buffer)); 4266 CfaExpr.append(buffer, buffer + encodeULEB128(OffsetExpr.size(), buffer)); 4267 CfaExpr.append(OffsetExpr.str()); 4268 4269 return MCCFIInstruction::createEscape(nullptr, CfaExpr.str(), Comment.str()); 4270 } 4271 4272 // Helper function to emit a frame offset adjustment from a given 4273 // pointer (SrcReg), stored into DestReg. This function is explicit 4274 // in that it requires the opcode. 
4275 static void emitFrameOffsetAdj(MachineBasicBlock &MBB, 4276 MachineBasicBlock::iterator MBBI, 4277 const DebugLoc &DL, unsigned DestReg, 4278 unsigned SrcReg, int64_t Offset, unsigned Opc, 4279 const TargetInstrInfo *TII, 4280 MachineInstr::MIFlag Flag, bool NeedsWinCFI, 4281 bool *HasWinCFI, bool EmitCFAOffset, 4282 StackOffset CFAOffset, unsigned FrameReg) { 4283 int Sign = 1; 4284 unsigned MaxEncoding, ShiftSize; 4285 switch (Opc) { 4286 case AArch64::ADDXri: 4287 case AArch64::ADDSXri: 4288 case AArch64::SUBXri: 4289 case AArch64::SUBSXri: 4290 MaxEncoding = 0xfff; 4291 ShiftSize = 12; 4292 break; 4293 case AArch64::ADDVL_XXI: 4294 case AArch64::ADDPL_XXI: 4295 case AArch64::ADDSVL_XXI: 4296 case AArch64::ADDSPL_XXI: 4297 MaxEncoding = 31; 4298 ShiftSize = 0; 4299 if (Offset < 0) { 4300 MaxEncoding = 32; 4301 Sign = -1; 4302 Offset = -Offset; 4303 } 4304 break; 4305 default: 4306 llvm_unreachable("Unsupported opcode"); 4307 } 4308 4309 // `Offset` can be in bytes or in "scalable bytes". 4310 int VScale = 1; 4311 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI) 4312 VScale = 16; 4313 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI) 4314 VScale = 2; 4315 4316 // FIXME: If the offset won't fit in 24-bits, compute the offset into a 4317 // scratch register. If DestReg is a virtual register, use it as the 4318 // scratch register; otherwise, create a new virtual register (to be 4319 // replaced by the scavenger at the end of PEI). That case can be optimized 4320 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch 4321 // register can be loaded with offset%8 and the add/sub can use an extending 4322 // instruction with LSL#3. 4323 // Currently the function handles any offsets but generates a poor sequence 4324 // of code. 4325 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); 4326 4327 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; 4328 Register TmpReg = DestReg; 4329 if (TmpReg == AArch64::XZR) 4330 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister( 4331 &AArch64::GPR64RegClass); 4332 do { 4333 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue); 4334 unsigned LocalShiftSize = 0; 4335 if (ThisVal > MaxEncoding) { 4336 ThisVal = ThisVal >> ShiftSize; 4337 LocalShiftSize = ShiftSize; 4338 } 4339 assert((ThisVal >> ShiftSize) <= MaxEncoding && 4340 "Encoding cannot handle value that big"); 4341 4342 Offset -= ThisVal << LocalShiftSize; 4343 if (Offset == 0) 4344 TmpReg = DestReg; 4345 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg) 4346 .addReg(SrcReg) 4347 .addImm(Sign * (int)ThisVal); 4348 if (ShiftSize) 4349 MBI = MBI.addImm( 4350 AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize)); 4351 MBI = MBI.setMIFlag(Flag); 4352 4353 auto Change = 4354 VScale == 1 4355 ? 
StackOffset::getFixed(ThisVal << LocalShiftSize) 4356 : StackOffset::getScalable(VScale * (ThisVal << LocalShiftSize)); 4357 if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri) 4358 CFAOffset += Change; 4359 else 4360 CFAOffset -= Change; 4361 if (EmitCFAOffset && DestReg == TmpReg) { 4362 MachineFunction &MF = *MBB.getParent(); 4363 const TargetSubtargetInfo &STI = MF.getSubtarget(); 4364 const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); 4365 4366 unsigned CFIIndex = MF.addFrameInst( 4367 createDefCFA(TRI, FrameReg, DestReg, CFAOffset, VScale != 1)); 4368 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) 4369 .addCFIIndex(CFIIndex) 4370 .setMIFlags(Flag); 4371 } 4372 4373 if (NeedsWinCFI) { 4374 assert(Sign == 1 && "SEH directives should always have a positive sign"); 4375 int Imm = (int)(ThisVal << LocalShiftSize); 4376 if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) || 4377 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) { 4378 if (HasWinCFI) 4379 *HasWinCFI = true; 4380 if (Imm == 0) 4381 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag); 4382 else 4383 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)) 4384 .addImm(Imm) 4385 .setMIFlag(Flag); 4386 assert(Offset == 0 && "Expected remaining offset to be zero to " 4387 "emit a single SEH directive"); 4388 } else if (DestReg == AArch64::SP) { 4389 if (HasWinCFI) 4390 *HasWinCFI = true; 4391 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc"); 4392 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) 4393 .addImm(Imm) 4394 .setMIFlag(Flag); 4395 } 4396 } 4397 4398 SrcReg = TmpReg; 4399 } while (Offset); 4400 } 4401 4402 void llvm::emitFrameOffset(MachineBasicBlock &MBB, 4403 MachineBasicBlock::iterator MBBI, const DebugLoc &DL, 4404 unsigned DestReg, unsigned SrcReg, 4405 StackOffset Offset, const TargetInstrInfo *TII, 4406 MachineInstr::MIFlag Flag, bool SetNZCV, 4407 bool NeedsWinCFI, bool *HasWinCFI, 4408 bool EmitCFAOffset, StackOffset CFAOffset, 4409 unsigned FrameReg) { 4410 // If a function is marked as arm_locally_streaming, then the runtime value of 4411 // vscale in the prologue/epilogue is different from the runtime value of 4412 // vscale in the function's body. To avoid having to consider multiple vscales, 4413 // we can use `addsvl` to allocate any scalable stack-slots, which under 4414 // most circumstances will be only locals, not callee-save slots. 4415 const Function &F = MBB.getParent()->getFunction(); 4416 bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body"); 4417 4418 int64_t Bytes, NumPredicateVectors, NumDataVectors; 4419 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( 4420 Offset, Bytes, NumPredicateVectors, NumDataVectors); 4421 4422 // First emit non-scalable frame offsets, or a simple 'mov'. 4423 if (Bytes || (!Offset && SrcReg != DestReg)) { 4424 assert((DestReg != AArch64::SP || Bytes % 8 == 0) && 4425 "SP increment/decrement not 8-byte aligned"); 4426 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri; 4427 if (Bytes < 0) { 4428 Bytes = -Bytes; 4429 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri; 4430 } 4431 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag, 4432 NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset, 4433 FrameReg); 4434 CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri) 4435 ?
StackOffset::getFixed(-Bytes) 4436 : StackOffset::getFixed(Bytes); 4437 SrcReg = DestReg; 4438 FrameReg = DestReg; 4439 } 4440 4441 assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) && 4442 "SetNZCV not supported with SVE vectors"); 4443 assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) && 4444 "WinCFI not supported with SVE vectors"); 4445 4446 if (NumDataVectors) { 4447 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors, 4448 UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI, 4449 TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset, 4450 CFAOffset, FrameReg); 4451 CFAOffset += StackOffset::getScalable(-NumDataVectors * 16); 4452 SrcReg = DestReg; 4453 } 4454 4455 if (NumPredicateVectors) { 4456 assert(DestReg != AArch64::SP && "Unaligned access to SP"); 4457 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors, 4458 UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI, 4459 TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset, 4460 CFAOffset, FrameReg); 4461 } 4462 } 4463 4464 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( 4465 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, 4466 MachineBasicBlock::iterator InsertPt, int FrameIndex, 4467 LiveIntervals *LIS, VirtRegMap *VRM) const { 4468 // This is a bit of a hack. Consider this instruction: 4469 // 4470 // %0 = COPY %sp; GPR64all:%0 4471 // 4472 // We explicitly chose GPR64all for the virtual register so such a copy might 4473 // be eliminated by RegisterCoalescer. However, that may not be possible, and 4474 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all 4475 // register class, TargetInstrInfo::foldMemoryOperand() is going to try. 4476 // 4477 // To prevent that, we are going to constrain the %0 register class here. 4478 // 4479 // <rdar://problem/11522048> 4480 // 4481 if (MI.isFullCopy()) { 4482 Register DstReg = MI.getOperand(0).getReg(); 4483 Register SrcReg = MI.getOperand(1).getReg(); 4484 if (SrcReg == AArch64::SP && DstReg.isVirtual()) { 4485 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass); 4486 return nullptr; 4487 } 4488 if (DstReg == AArch64::SP && SrcReg.isVirtual()) { 4489 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 4490 return nullptr; 4491 } 4492 // Nothing can folded with copy from/to NZCV. 4493 if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV) 4494 return nullptr; 4495 } 4496 4497 // Handle the case where a copy is being spilled or filled but the source 4498 // and destination register class don't match. For example: 4499 // 4500 // %0 = COPY %xzr; GPR64common:%0 4501 // 4502 // In this case we can still safely fold away the COPY and generate the 4503 // following spill code: 4504 // 4505 // STRXui %xzr, %stack.0 4506 // 4507 // This also eliminates spilled cross register class COPYs (e.g. between x and 4508 // d regs) of the same size. For example: 4509 // 4510 // %0 = COPY %1; GPR64:%0, FPR64:%1 4511 // 4512 // will be filled as 4513 // 4514 // LDRDui %0, fi<#0> 4515 // 4516 // instead of 4517 // 4518 // LDRXui %Temp, fi<#0> 4519 // %0 = FMOV %Temp 4520 // 4521 if (MI.isCopy() && Ops.size() == 1 && 4522 // Make sure we're only folding the explicit COPY defs/uses. 
4523 (Ops[0] == 0 || Ops[0] == 1)) { 4524 bool IsSpill = Ops[0] == 0; 4525 bool IsFill = !IsSpill; 4526 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); 4527 const MachineRegisterInfo &MRI = MF.getRegInfo(); 4528 MachineBasicBlock &MBB = *MI.getParent(); 4529 const MachineOperand &DstMO = MI.getOperand(0); 4530 const MachineOperand &SrcMO = MI.getOperand(1); 4531 Register DstReg = DstMO.getReg(); 4532 Register SrcReg = SrcMO.getReg(); 4533 // This is slightly expensive to compute for physical regs since 4534 // getMinimalPhysRegClass is slow. 4535 auto getRegClass = [&](unsigned Reg) { 4536 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg) 4537 : TRI.getMinimalPhysRegClass(Reg); 4538 }; 4539 4540 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { 4541 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) == 4542 TRI.getRegSizeInBits(*getRegClass(SrcReg)) && 4543 "Mismatched register size in non subreg COPY"); 4544 if (IsSpill) 4545 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, 4546 getRegClass(SrcReg), &TRI, Register()); 4547 else 4548 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, 4549 getRegClass(DstReg), &TRI, Register()); 4550 return &*--InsertPt; 4551 } 4552 4553 // Handle cases like spilling def of: 4554 // 4555 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0 4556 // 4557 // where the physical register source can be widened and stored to the full 4558 // virtual reg destination stack slot, in this case producing: 4559 // 4560 // STRXui %xzr, %stack.0 4561 // 4562 if (IsSpill && DstMO.isUndef() && SrcReg.isPhysical()) { 4563 assert(SrcMO.getSubReg() == 0 && 4564 "Unexpected subreg on physical register"); 4565 const TargetRegisterClass *SpillRC; 4566 unsigned SpillSubreg; 4567 switch (DstMO.getSubReg()) { 4568 default: 4569 SpillRC = nullptr; 4570 break; 4571 case AArch64::sub_32: 4572 case AArch64::ssub: 4573 if (AArch64::GPR32RegClass.contains(SrcReg)) { 4574 SpillRC = &AArch64::GPR64RegClass; 4575 SpillSubreg = AArch64::sub_32; 4576 } else if (AArch64::FPR32RegClass.contains(SrcReg)) { 4577 SpillRC = &AArch64::FPR64RegClass; 4578 SpillSubreg = AArch64::ssub; 4579 } else 4580 SpillRC = nullptr; 4581 break; 4582 case AArch64::dsub: 4583 if (AArch64::FPR64RegClass.contains(SrcReg)) { 4584 SpillRC = &AArch64::FPR128RegClass; 4585 SpillSubreg = AArch64::dsub; 4586 } else 4587 SpillRC = nullptr; 4588 break; 4589 } 4590 4591 if (SpillRC) 4592 if (unsigned WidenedSrcReg = 4593 TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) { 4594 storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(), 4595 FrameIndex, SpillRC, &TRI, Register()); 4596 return &*--InsertPt; 4597 } 4598 } 4599 4600 // Handle cases like filling use of: 4601 // 4602 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1 4603 // 4604 // where we can load the full virtual reg source stack slot, into the subreg 4605 // destination, in this case producing: 4606 // 4607 // LDRWui %0:sub_32<def,read-undef>, %stack.0 4608 // 4609 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) { 4610 const TargetRegisterClass *FillRC; 4611 switch (DstMO.getSubReg()) { 4612 default: 4613 FillRC = nullptr; 4614 break; 4615 case AArch64::sub_32: 4616 FillRC = &AArch64::GPR32RegClass; 4617 break; 4618 case AArch64::ssub: 4619 FillRC = &AArch64::FPR32RegClass; 4620 break; 4621 case AArch64::dsub: 4622 FillRC = &AArch64::FPR64RegClass; 4623 break; 4624 } 4625 4626 if (FillRC) { 4627 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) == 4628 
TRI.getRegSizeInBits(*FillRC) && 4629 "Mismatched regclass size on folded subreg COPY"); 4630 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI, 4631 Register()); 4632 MachineInstr &LoadMI = *--InsertPt; 4633 MachineOperand &LoadDst = LoadMI.getOperand(0); 4634 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load"); 4635 LoadDst.setSubReg(DstMO.getSubReg()); 4636 LoadDst.setIsUndef(); 4637 return &LoadMI; 4638 } 4639 } 4640 } 4641 4642 // Cannot fold. 4643 return nullptr; 4644 } 4645 4646 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, 4647 StackOffset &SOffset, 4648 bool *OutUseUnscaledOp, 4649 unsigned *OutUnscaledOp, 4650 int64_t *EmittableOffset) { 4651 // Set output values in case of early exit. 4652 if (EmittableOffset) 4653 *EmittableOffset = 0; 4654 if (OutUseUnscaledOp) 4655 *OutUseUnscaledOp = false; 4656 if (OutUnscaledOp) 4657 *OutUnscaledOp = 0; 4658 4659 // Exit early for structured vector spills/fills as they can't take an 4660 // immediate offset. 4661 switch (MI.getOpcode()) { 4662 default: 4663 break; 4664 case AArch64::LD1Twov2d: 4665 case AArch64::LD1Threev2d: 4666 case AArch64::LD1Fourv2d: 4667 case AArch64::LD1Twov1d: 4668 case AArch64::LD1Threev1d: 4669 case AArch64::LD1Fourv1d: 4670 case AArch64::ST1Twov2d: 4671 case AArch64::ST1Threev2d: 4672 case AArch64::ST1Fourv2d: 4673 case AArch64::ST1Twov1d: 4674 case AArch64::ST1Threev1d: 4675 case AArch64::ST1Fourv1d: 4676 case AArch64::ST1i8: 4677 case AArch64::ST1i16: 4678 case AArch64::ST1i32: 4679 case AArch64::ST1i64: 4680 case AArch64::IRG: 4681 case AArch64::IRGstack: 4682 case AArch64::STGloop: 4683 case AArch64::STZGloop: 4684 return AArch64FrameOffsetCannotUpdate; 4685 } 4686 4687 // Get the min/max offset and the scale. 4688 TypeSize ScaleValue(0U, false); 4689 unsigned Width; 4690 int64_t MinOff, MaxOff; 4691 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff, 4692 MaxOff)) 4693 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 4694 4695 // Construct the complete offset. 4696 bool IsMulVL = ScaleValue.isScalable(); 4697 unsigned Scale = ScaleValue.getKnownMinValue(); 4698 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed(); 4699 4700 const MachineOperand &ImmOpnd = 4701 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode())); 4702 Offset += ImmOpnd.getImm() * Scale; 4703 4704 // If the offset doesn't match the scale, we rewrite the instruction to 4705 // use the unscaled instruction instead. Likewise, if we have a negative 4706 // offset and there is an unscaled op to use. 4707 std::optional<unsigned> UnscaledOp = 4708 AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode()); 4709 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0); 4710 if (useUnscaledOp && 4711 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff, 4712 MaxOff)) 4713 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 4714 4715 Scale = ScaleValue.getKnownMinValue(); 4716 assert(IsMulVL == ScaleValue.isScalable() && 4717 "Unscaled opcode has different value for scalable"); 4718 4719 int64_t Remainder = Offset % Scale; 4720 assert(!(Remainder && useUnscaledOp) && 4721 "Cannot have remainder when using unscaled op"); 4722 4723 assert(MinOff < MaxOff && "Unexpected Min/Max offsets"); 4724 int64_t NewOffset = Offset / Scale; 4725 if (MinOff <= NewOffset && NewOffset <= MaxOff) 4726 Offset = Remainder; 4727 else { 4728 NewOffset = NewOffset < 0 ? 
MinOff : MaxOff; 4729 Offset = Offset - NewOffset * Scale + Remainder; 4730 } 4731 4732 if (EmittableOffset) 4733 *EmittableOffset = NewOffset; 4734 if (OutUseUnscaledOp) 4735 *OutUseUnscaledOp = useUnscaledOp; 4736 if (OutUnscaledOp && UnscaledOp) 4737 *OutUnscaledOp = *UnscaledOp; 4738 4739 if (IsMulVL) 4740 SOffset = StackOffset::get(SOffset.getFixed(), Offset); 4741 else 4742 SOffset = StackOffset::get(Offset, SOffset.getScalable()); 4743 return AArch64FrameOffsetCanUpdate | 4744 (SOffset ? 0 : AArch64FrameOffsetIsLegal); 4745 } 4746 4747 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, 4748 unsigned FrameReg, StackOffset &Offset, 4749 const AArch64InstrInfo *TII) { 4750 unsigned Opcode = MI.getOpcode(); 4751 unsigned ImmIdx = FrameRegIdx + 1; 4752 4753 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) { 4754 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm()); 4755 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), 4756 MI.getOperand(0).getReg(), FrameReg, Offset, TII, 4757 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri)); 4758 MI.eraseFromParent(); 4759 Offset = StackOffset(); 4760 return true; 4761 } 4762 4763 int64_t NewOffset; 4764 unsigned UnscaledOp; 4765 bool UseUnscaledOp; 4766 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, 4767 &UnscaledOp, &NewOffset); 4768 if (Status & AArch64FrameOffsetCanUpdate) { 4769 if (Status & AArch64FrameOffsetIsLegal) 4770 // Replace the FrameIndex with FrameReg. 4771 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); 4772 if (UseUnscaledOp) 4773 MI.setDesc(TII->get(UnscaledOp)); 4774 4775 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); 4776 return !Offset; 4777 } 4778 4779 return false; 4780 } 4781 4782 MCInst AArch64InstrInfo::getNop() const { 4783 return MCInstBuilder(AArch64::HINT).addImm(0); 4784 } 4785 4786 // AArch64 supports MachineCombiner. 4787 bool AArch64InstrInfo::useMachineCombiner() const { return true; } 4788 4789 // True when Opc sets flag 4790 static bool isCombineInstrSettingFlag(unsigned Opc) { 4791 switch (Opc) { 4792 case AArch64::ADDSWrr: 4793 case AArch64::ADDSWri: 4794 case AArch64::ADDSXrr: 4795 case AArch64::ADDSXri: 4796 case AArch64::SUBSWrr: 4797 case AArch64::SUBSXrr: 4798 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 4799 case AArch64::SUBSWri: 4800 case AArch64::SUBSXri: 4801 return true; 4802 default: 4803 break; 4804 } 4805 return false; 4806 } 4807 4808 // 32b Opcodes that can be combined with a MUL 4809 static bool isCombineInstrCandidate32(unsigned Opc) { 4810 switch (Opc) { 4811 case AArch64::ADDWrr: 4812 case AArch64::ADDWri: 4813 case AArch64::SUBWrr: 4814 case AArch64::ADDSWrr: 4815 case AArch64::ADDSWri: 4816 case AArch64::SUBSWrr: 4817 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 4818 case AArch64::SUBWri: 4819 case AArch64::SUBSWri: 4820 return true; 4821 default: 4822 break; 4823 } 4824 return false; 4825 } 4826 4827 // 64b Opcodes that can be combined with a MUL 4828 static bool isCombineInstrCandidate64(unsigned Opc) { 4829 switch (Opc) { 4830 case AArch64::ADDXrr: 4831 case AArch64::ADDXri: 4832 case AArch64::SUBXrr: 4833 case AArch64::ADDSXrr: 4834 case AArch64::ADDSXri: 4835 case AArch64::SUBSXrr: 4836 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 
4837 case AArch64::SUBXri: 4838 case AArch64::SUBSXri: 4839 case AArch64::ADDv8i8: 4840 case AArch64::ADDv16i8: 4841 case AArch64::ADDv4i16: 4842 case AArch64::ADDv8i16: 4843 case AArch64::ADDv2i32: 4844 case AArch64::ADDv4i32: 4845 case AArch64::SUBv8i8: 4846 case AArch64::SUBv16i8: 4847 case AArch64::SUBv4i16: 4848 case AArch64::SUBv8i16: 4849 case AArch64::SUBv2i32: 4850 case AArch64::SUBv4i32: 4851 return true; 4852 default: 4853 break; 4854 } 4855 return false; 4856 } 4857 4858 // FP Opcodes that can be combined with a FMUL. 4859 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { 4860 switch (Inst.getOpcode()) { 4861 default: 4862 break; 4863 case AArch64::FADDHrr: 4864 case AArch64::FADDSrr: 4865 case AArch64::FADDDrr: 4866 case AArch64::FADDv4f16: 4867 case AArch64::FADDv8f16: 4868 case AArch64::FADDv2f32: 4869 case AArch64::FADDv2f64: 4870 case AArch64::FADDv4f32: 4871 case AArch64::FSUBHrr: 4872 case AArch64::FSUBSrr: 4873 case AArch64::FSUBDrr: 4874 case AArch64::FSUBv4f16: 4875 case AArch64::FSUBv8f16: 4876 case AArch64::FSUBv2f32: 4877 case AArch64::FSUBv2f64: 4878 case AArch64::FSUBv4f32: 4879 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options; 4880 // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by 4881 // the target options or if FADD/FSUB has the contract fast-math flag. 4882 return Options.UnsafeFPMath || 4883 Options.AllowFPOpFusion == FPOpFusion::Fast || 4884 Inst.getFlag(MachineInstr::FmContract); 4886 } 4887 return false; 4888 } 4889 4890 // Opcodes that can be combined with a MUL 4891 static bool isCombineInstrCandidate(unsigned Opc) { 4892 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc)); 4893 } 4894 4895 // 4896 // Utility routine that checks if \param MO is defined by an 4897 // \param CombineOpc instruction in the basic block \param MBB 4898 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, 4899 unsigned CombineOpc, unsigned ZeroReg = 0, 4900 bool CheckZeroReg = false) { 4901 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 4902 MachineInstr *MI = nullptr; 4903 4904 if (MO.isReg() && MO.getReg().isVirtual()) 4905 MI = MRI.getUniqueVRegDef(MO.getReg()); 4906 // And it needs to be in the trace (otherwise, it won't have a depth). 4907 if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc) 4908 return false; 4909 // Must only be used by the user we combine with. 4910 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) 4911 return false; 4912 4913 if (CheckZeroReg) { 4914 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && 4915 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && 4916 MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs"); 4917 // The third input reg must be zero. 4918 if (MI->getOperand(3).getReg() != ZeroReg) 4919 return false; 4920 } 4921 4922 if (isCombineInstrSettingFlag(CombineOpc) && 4923 MI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 4924 return false; 4925 4926 return true; 4927 } 4928 4929 // 4930 // Is \param MO defined by an integer multiply and can be combined? 4931 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, 4932 unsigned MulOpc, unsigned ZeroReg) { 4933 return canCombine(MBB, MO, MulOpc, ZeroReg, true); 4934 } 4935 4936 // 4937 // Is \param MO defined by a floating-point multiply and can be combined?
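// Unlike the integer variant above, no zero accumulator register is checked
// here; this simply forwards to canCombine without CheckZeroReg.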
4938 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, 4939 unsigned MulOpc) { 4940 return canCombine(MBB, MO, MulOpc); 4941 } 4942 4943 // TODO: There are many more machine instruction opcodes to match: 4944 // 1. Other data types (integer, vectors) 4945 // 2. Other math / logic operations (xor, or) 4946 // 3. Other forms of the same operation (intrinsics and other variants) 4947 bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst, 4948 bool Invert) const { 4949 if (Invert) 4950 return false; 4951 switch (Inst.getOpcode()) { 4952 // == Floating-point types == 4953 // -- Floating-point instructions -- 4954 case AArch64::FADDHrr: 4955 case AArch64::FADDSrr: 4956 case AArch64::FADDDrr: 4957 case AArch64::FMULHrr: 4958 case AArch64::FMULSrr: 4959 case AArch64::FMULDrr: 4960 case AArch64::FMULX16: 4961 case AArch64::FMULX32: 4962 case AArch64::FMULX64: 4963 // -- Advanced SIMD instructions -- 4964 case AArch64::FADDv4f16: 4965 case AArch64::FADDv8f16: 4966 case AArch64::FADDv2f32: 4967 case AArch64::FADDv4f32: 4968 case AArch64::FADDv2f64: 4969 case AArch64::FMULv4f16: 4970 case AArch64::FMULv8f16: 4971 case AArch64::FMULv2f32: 4972 case AArch64::FMULv4f32: 4973 case AArch64::FMULv2f64: 4974 case AArch64::FMULXv4f16: 4975 case AArch64::FMULXv8f16: 4976 case AArch64::FMULXv2f32: 4977 case AArch64::FMULXv4f32: 4978 case AArch64::FMULXv2f64: 4979 // -- SVE instructions -- 4980 // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX 4981 // in the SVE instruction set (though there are predicated ones). 4982 case AArch64::FADD_ZZZ_H: 4983 case AArch64::FADD_ZZZ_S: 4984 case AArch64::FADD_ZZZ_D: 4985 case AArch64::FMUL_ZZZ_H: 4986 case AArch64::FMUL_ZZZ_S: 4987 case AArch64::FMUL_ZZZ_D: 4988 return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath || 4989 (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && 4990 Inst.getFlag(MachineInstr::MIFlag::FmNsz)); 4991 4992 // == Integer types == 4993 // -- Base instructions -- 4994 // Opcodes MULWrr and MULXrr don't exist because 4995 // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of 4996 // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively. 4997 // The machine-combiner does not support three-source-operands machine 4998 // instruction. So we cannot reassociate MULs. 4999 case AArch64::ADDWrr: 5000 case AArch64::ADDXrr: 5001 case AArch64::ANDWrr: 5002 case AArch64::ANDXrr: 5003 case AArch64::ORRWrr: 5004 case AArch64::ORRXrr: 5005 case AArch64::EORWrr: 5006 case AArch64::EORXrr: 5007 case AArch64::EONWrr: 5008 case AArch64::EONXrr: 5009 // -- Advanced SIMD instructions -- 5010 // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL 5011 // in the Advanced SIMD instruction set. 
5012 case AArch64::ADDv8i8: 5013 case AArch64::ADDv16i8: 5014 case AArch64::ADDv4i16: 5015 case AArch64::ADDv8i16: 5016 case AArch64::ADDv2i32: 5017 case AArch64::ADDv4i32: 5018 case AArch64::ADDv1i64: 5019 case AArch64::ADDv2i64: 5020 case AArch64::MULv8i8: 5021 case AArch64::MULv16i8: 5022 case AArch64::MULv4i16: 5023 case AArch64::MULv8i16: 5024 case AArch64::MULv2i32: 5025 case AArch64::MULv4i32: 5026 case AArch64::ANDv8i8: 5027 case AArch64::ANDv16i8: 5028 case AArch64::ORRv8i8: 5029 case AArch64::ORRv16i8: 5030 case AArch64::EORv8i8: 5031 case AArch64::EORv16i8: 5032 // -- SVE instructions -- 5033 case AArch64::ADD_ZZZ_B: 5034 case AArch64::ADD_ZZZ_H: 5035 case AArch64::ADD_ZZZ_S: 5036 case AArch64::ADD_ZZZ_D: 5037 case AArch64::MUL_ZZZ_B: 5038 case AArch64::MUL_ZZZ_H: 5039 case AArch64::MUL_ZZZ_S: 5040 case AArch64::MUL_ZZZ_D: 5041 case AArch64::AND_ZZZ: 5042 case AArch64::ORR_ZZZ: 5043 case AArch64::EOR_ZZZ: 5044 return true; 5045 5046 default: 5047 return false; 5048 } 5049 } 5050 5051 /// Find instructions that can be turned into madd. 5052 static bool getMaddPatterns(MachineInstr &Root, 5053 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 5054 unsigned Opc = Root.getOpcode(); 5055 MachineBasicBlock &MBB = *Root.getParent(); 5056 bool Found = false; 5057 5058 if (!isCombineInstrCandidate(Opc)) 5059 return false; 5060 if (isCombineInstrSettingFlag(Opc)) { 5061 int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true); 5062 // When NZCV is live bail out. 5063 if (Cmp_NZCV == -1) 5064 return false; 5065 unsigned NewOpc = convertToNonFlagSettingOpc(Root); 5066 // When opcode can't change bail out. 5067 // CHECKME: do we miss any cases for opcode conversion? 5068 if (NewOpc == Opc) 5069 return false; 5070 Opc = NewOpc; 5071 } 5072 5073 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg, 5074 MachineCombinerPattern Pattern) { 5075 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) { 5076 Patterns.push_back(Pattern); 5077 Found = true; 5078 } 5079 }; 5080 5081 auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) { 5082 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) { 5083 Patterns.push_back(Pattern); 5084 Found = true; 5085 } 5086 }; 5087 5088 typedef MachineCombinerPattern MCP; 5089 5090 switch (Opc) { 5091 default: 5092 break; 5093 case AArch64::ADDWrr: 5094 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 5095 "ADDWrr does not have register operands"); 5096 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1); 5097 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2); 5098 break; 5099 case AArch64::ADDXrr: 5100 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1); 5101 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2); 5102 break; 5103 case AArch64::SUBWrr: 5104 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1); 5105 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2); 5106 break; 5107 case AArch64::SUBXrr: 5108 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1); 5109 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2); 5110 break; 5111 case AArch64::ADDWri: 5112 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1); 5113 break; 5114 case AArch64::ADDXri: 5115 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1); 5116 break; 5117 case AArch64::SUBWri: 5118 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1); 5119 break; 5120 case AArch64::SUBXri: 5121 
setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1); 5122 break; 5123 case AArch64::ADDv8i8: 5124 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1); 5125 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2); 5126 break; 5127 case AArch64::ADDv16i8: 5128 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1); 5129 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2); 5130 break; 5131 case AArch64::ADDv4i16: 5132 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1); 5133 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2); 5134 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1); 5135 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2); 5136 break; 5137 case AArch64::ADDv8i16: 5138 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1); 5139 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2); 5140 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1); 5141 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2); 5142 break; 5143 case AArch64::ADDv2i32: 5144 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1); 5145 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2); 5146 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1); 5147 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2); 5148 break; 5149 case AArch64::ADDv4i32: 5150 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1); 5151 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2); 5152 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1); 5153 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2); 5154 break; 5155 case AArch64::SUBv8i8: 5156 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1); 5157 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2); 5158 break; 5159 case AArch64::SUBv16i8: 5160 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1); 5161 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2); 5162 break; 5163 case AArch64::SUBv4i16: 5164 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1); 5165 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2); 5166 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1); 5167 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2); 5168 break; 5169 case AArch64::SUBv8i16: 5170 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1); 5171 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2); 5172 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1); 5173 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2); 5174 break; 5175 case AArch64::SUBv2i32: 5176 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1); 5177 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2); 5178 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1); 5179 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2); 5180 break; 5181 case AArch64::SUBv4i32: 5182 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1); 5183 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2); 5184 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1); 5185 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2); 5186 break; 5187 } 5188 return Found; 5189 } 5190 /// Floating-Point Support 5191 5192 /// Find instructions that can be turned into madd. 
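/// As an illustrative sketch (virtual register numbers are hypothetical):
///   %3:fpr32 = FMULSrr %1, %2
///   %4:fpr32 = FADDSrr %0, %3
/// matches FMULADDS_OP2 here, and the combiner may later rewrite it as
///   %4:fpr32 = FMADDSrrr %1, %2, %0
/// provided the FMUL result has no other (non-debug) users.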
5193 static bool getFMAPatterns(MachineInstr &Root, 5194 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 5195 5196 if (!isCombineInstrCandidateFP(Root)) 5197 return false; 5198 5199 MachineBasicBlock &MBB = *Root.getParent(); 5200 bool Found = false; 5201 5202 auto Match = [&](int Opcode, int Operand, 5203 MachineCombinerPattern Pattern) -> bool { 5204 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) { 5205 Patterns.push_back(Pattern); 5206 return true; 5207 } 5208 return false; 5209 }; 5210 5211 typedef MachineCombinerPattern MCP; 5212 5213 switch (Root.getOpcode()) { 5214 default: 5215 assert(false && "Unsupported FP instruction in combiner\n"); 5216 break; 5217 case AArch64::FADDHrr: 5218 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 5219 "FADDHrr does not have register operands"); 5220 5221 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1); 5222 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2); 5223 break; 5224 case AArch64::FADDSrr: 5225 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 5226 "FADDSrr does not have register operands"); 5227 5228 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) || 5229 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1); 5230 5231 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) || 5232 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2); 5233 break; 5234 case AArch64::FADDDrr: 5235 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) || 5236 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1); 5237 5238 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) || 5239 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2); 5240 break; 5241 case AArch64::FADDv4f16: 5242 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) || 5243 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1); 5244 5245 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) || 5246 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2); 5247 break; 5248 case AArch64::FADDv8f16: 5249 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) || 5250 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1); 5251 5252 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) || 5253 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2); 5254 break; 5255 case AArch64::FADDv2f32: 5256 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) || 5257 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1); 5258 5259 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) || 5260 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2); 5261 break; 5262 case AArch64::FADDv2f64: 5263 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) || 5264 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1); 5265 5266 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) || 5267 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2); 5268 break; 5269 case AArch64::FADDv4f32: 5270 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) || 5271 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1); 5272 5273 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) || 5274 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2); 5275 break; 5276 case AArch64::FSUBHrr: 5277 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1); 5278 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2); 5279 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1); 5280 break; 
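  // The remaining FSUB cases follow the same scheme as FSUBHrr above. As an
  // illustrative sketch: FSUB (FMUL a, b), c matches the *_OP1 patterns and
  // can later become FNMSUB a, b, c (computing a*b - c), while
  // FSUB c, (FMUL a, b) matches the *_OP2 patterns and can become
  // FMSUB a, b, c (computing c - a*b).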
5281 case AArch64::FSUBSrr: 5282 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1); 5283 5284 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) || 5285 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2); 5286 5287 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1); 5288 break; 5289 case AArch64::FSUBDrr: 5290 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1); 5291 5292 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) || 5293 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2); 5294 5295 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1); 5296 break; 5297 case AArch64::FSUBv4f16: 5298 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) || 5299 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2); 5300 5301 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) || 5302 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1); 5303 break; 5304 case AArch64::FSUBv8f16: 5305 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) || 5306 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2); 5307 5308 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) || 5309 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1); 5310 break; 5311 case AArch64::FSUBv2f32: 5312 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) || 5313 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2); 5314 5315 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) || 5316 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1); 5317 break; 5318 case AArch64::FSUBv2f64: 5319 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) || 5320 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2); 5321 5322 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) || 5323 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1); 5324 break; 5325 case AArch64::FSUBv4f32: 5326 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) || 5327 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2); 5328 5329 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) || 5330 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1); 5331 break; 5332 } 5333 return Found; 5334 } 5335 5336 static bool getFMULPatterns(MachineInstr &Root, 5337 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 5338 MachineBasicBlock &MBB = *Root.getParent(); 5339 bool Found = false; 5340 5341 auto Match = [&](unsigned Opcode, int Operand, 5342 MachineCombinerPattern Pattern) -> bool { 5343 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5344 MachineOperand &MO = Root.getOperand(Operand); 5345 MachineInstr *MI = nullptr; 5346 if (MO.isReg() && MO.getReg().isVirtual()) 5347 MI = MRI.getUniqueVRegDef(MO.getReg()); 5348 // Ignore No-op COPYs in FMUL(COPY(DUP(..))) 5349 if (MI && MI->getOpcode() == TargetOpcode::COPY && 5350 MI->getOperand(1).getReg().isVirtual()) 5351 MI = MRI.getUniqueVRegDef(MI->getOperand(1).getReg()); 5352 if (MI && MI->getOpcode() == Opcode) { 5353 Patterns.push_back(Pattern); 5354 return true; 5355 } 5356 return false; 5357 }; 5358 5359 typedef MachineCombinerPattern MCP; 5360 5361 switch (Root.getOpcode()) { 5362 default: 5363 return false; 5364 case AArch64::FMULv2f32: 5365 Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1); 5366 Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2); 5367 break; 5368 case AArch64::FMULv2f64: 5369 Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1); 5370 Found |= 
Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2); 5371 break; 5372 case AArch64::FMULv4f16: 5373 Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1); 5374 Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2); 5375 break; 5376 case AArch64::FMULv4f32: 5377 Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1); 5378 Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2); 5379 break; 5380 case AArch64::FMULv8f16: 5381 Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1); 5382 Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2); 5383 break; 5384 } 5385 5386 return Found; 5387 } 5388 5389 /// Return true when a code sequence can improve throughput. It 5390 /// should be called only for instructions in loops. 5391 /// \param Pattern - combiner pattern 5392 bool AArch64InstrInfo::isThroughputPattern( 5393 MachineCombinerPattern Pattern) const { 5394 switch (Pattern) { 5395 default: 5396 break; 5397 case MachineCombinerPattern::FMULADDH_OP1: 5398 case MachineCombinerPattern::FMULADDH_OP2: 5399 case MachineCombinerPattern::FMULSUBH_OP1: 5400 case MachineCombinerPattern::FMULSUBH_OP2: 5401 case MachineCombinerPattern::FMULADDS_OP1: 5402 case MachineCombinerPattern::FMULADDS_OP2: 5403 case MachineCombinerPattern::FMULSUBS_OP1: 5404 case MachineCombinerPattern::FMULSUBS_OP2: 5405 case MachineCombinerPattern::FMULADDD_OP1: 5406 case MachineCombinerPattern::FMULADDD_OP2: 5407 case MachineCombinerPattern::FMULSUBD_OP1: 5408 case MachineCombinerPattern::FMULSUBD_OP2: 5409 case MachineCombinerPattern::FNMULSUBH_OP1: 5410 case MachineCombinerPattern::FNMULSUBS_OP1: 5411 case MachineCombinerPattern::FNMULSUBD_OP1: 5412 case MachineCombinerPattern::FMLAv4i16_indexed_OP1: 5413 case MachineCombinerPattern::FMLAv4i16_indexed_OP2: 5414 case MachineCombinerPattern::FMLAv8i16_indexed_OP1: 5415 case MachineCombinerPattern::FMLAv8i16_indexed_OP2: 5416 case MachineCombinerPattern::FMLAv1i32_indexed_OP1: 5417 case MachineCombinerPattern::FMLAv1i32_indexed_OP2: 5418 case MachineCombinerPattern::FMLAv1i64_indexed_OP1: 5419 case MachineCombinerPattern::FMLAv1i64_indexed_OP2: 5420 case MachineCombinerPattern::FMLAv4f16_OP2: 5421 case MachineCombinerPattern::FMLAv4f16_OP1: 5422 case MachineCombinerPattern::FMLAv8f16_OP1: 5423 case MachineCombinerPattern::FMLAv8f16_OP2: 5424 case MachineCombinerPattern::FMLAv2f32_OP2: 5425 case MachineCombinerPattern::FMLAv2f32_OP1: 5426 case MachineCombinerPattern::FMLAv2f64_OP1: 5427 case MachineCombinerPattern::FMLAv2f64_OP2: 5428 case MachineCombinerPattern::FMLAv2i32_indexed_OP1: 5429 case MachineCombinerPattern::FMLAv2i32_indexed_OP2: 5430 case MachineCombinerPattern::FMLAv2i64_indexed_OP1: 5431 case MachineCombinerPattern::FMLAv2i64_indexed_OP2: 5432 case MachineCombinerPattern::FMLAv4f32_OP1: 5433 case MachineCombinerPattern::FMLAv4f32_OP2: 5434 case MachineCombinerPattern::FMLAv4i32_indexed_OP1: 5435 case MachineCombinerPattern::FMLAv4i32_indexed_OP2: 5436 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: 5437 case MachineCombinerPattern::FMLSv4i16_indexed_OP2: 5438 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: 5439 case MachineCombinerPattern::FMLSv8i16_indexed_OP2: 5440 case MachineCombinerPattern::FMLSv1i32_indexed_OP2: 5441 case MachineCombinerPattern::FMLSv1i64_indexed_OP2: 5442 case MachineCombinerPattern::FMLSv2i32_indexed_OP2: 5443 case MachineCombinerPattern::FMLSv2i64_indexed_OP2: 5444 case MachineCombinerPattern::FMLSv4f16_OP1: 5445 case 
MachineCombinerPattern::FMLSv4f16_OP2: 5446 case MachineCombinerPattern::FMLSv8f16_OP1: 5447 case MachineCombinerPattern::FMLSv8f16_OP2: 5448 case MachineCombinerPattern::FMLSv2f32_OP2: 5449 case MachineCombinerPattern::FMLSv2f64_OP2: 5450 case MachineCombinerPattern::FMLSv4i32_indexed_OP2: 5451 case MachineCombinerPattern::FMLSv4f32_OP2: 5452 case MachineCombinerPattern::FMULv2i32_indexed_OP1: 5453 case MachineCombinerPattern::FMULv2i32_indexed_OP2: 5454 case MachineCombinerPattern::FMULv2i64_indexed_OP1: 5455 case MachineCombinerPattern::FMULv2i64_indexed_OP2: 5456 case MachineCombinerPattern::FMULv4i16_indexed_OP1: 5457 case MachineCombinerPattern::FMULv4i16_indexed_OP2: 5458 case MachineCombinerPattern::FMULv4i32_indexed_OP1: 5459 case MachineCombinerPattern::FMULv4i32_indexed_OP2: 5460 case MachineCombinerPattern::FMULv8i16_indexed_OP1: 5461 case MachineCombinerPattern::FMULv8i16_indexed_OP2: 5462 case MachineCombinerPattern::MULADDv8i8_OP1: 5463 case MachineCombinerPattern::MULADDv8i8_OP2: 5464 case MachineCombinerPattern::MULADDv16i8_OP1: 5465 case MachineCombinerPattern::MULADDv16i8_OP2: 5466 case MachineCombinerPattern::MULADDv4i16_OP1: 5467 case MachineCombinerPattern::MULADDv4i16_OP2: 5468 case MachineCombinerPattern::MULADDv8i16_OP1: 5469 case MachineCombinerPattern::MULADDv8i16_OP2: 5470 case MachineCombinerPattern::MULADDv2i32_OP1: 5471 case MachineCombinerPattern::MULADDv2i32_OP2: 5472 case MachineCombinerPattern::MULADDv4i32_OP1: 5473 case MachineCombinerPattern::MULADDv4i32_OP2: 5474 case MachineCombinerPattern::MULSUBv8i8_OP1: 5475 case MachineCombinerPattern::MULSUBv8i8_OP2: 5476 case MachineCombinerPattern::MULSUBv16i8_OP1: 5477 case MachineCombinerPattern::MULSUBv16i8_OP2: 5478 case MachineCombinerPattern::MULSUBv4i16_OP1: 5479 case MachineCombinerPattern::MULSUBv4i16_OP2: 5480 case MachineCombinerPattern::MULSUBv8i16_OP1: 5481 case MachineCombinerPattern::MULSUBv8i16_OP2: 5482 case MachineCombinerPattern::MULSUBv2i32_OP1: 5483 case MachineCombinerPattern::MULSUBv2i32_OP2: 5484 case MachineCombinerPattern::MULSUBv4i32_OP1: 5485 case MachineCombinerPattern::MULSUBv4i32_OP2: 5486 case MachineCombinerPattern::MULADDv4i16_indexed_OP1: 5487 case MachineCombinerPattern::MULADDv4i16_indexed_OP2: 5488 case MachineCombinerPattern::MULADDv8i16_indexed_OP1: 5489 case MachineCombinerPattern::MULADDv8i16_indexed_OP2: 5490 case MachineCombinerPattern::MULADDv2i32_indexed_OP1: 5491 case MachineCombinerPattern::MULADDv2i32_indexed_OP2: 5492 case MachineCombinerPattern::MULADDv4i32_indexed_OP1: 5493 case MachineCombinerPattern::MULADDv4i32_indexed_OP2: 5494 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 5495 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 5496 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 5497 case MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 5498 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 5499 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 5500 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 5501 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 5502 return true; 5503 } // end switch (Pattern) 5504 return false; 5505 } 5506 5507 /// Find other MI combine patterns. 
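/// As a sketch of the only shape currently handled (sub-of-add, with
/// hypothetical registers):
///   sub w0, w1, w2   where   w2 = add w3, w4
/// can be rewritten as (w1 - w3) - w4 or (w1 - w4) - w3, which may shorten
/// the critical path when one of the add's operands arrives late.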
static bool getMiscPatterns(MachineInstr &Root,
                            SmallVectorImpl<MachineCombinerPattern> &Patterns)
{
  // A - (B + C)  ==>   (A - B) - C  or  (A - C) - B
  unsigned Opc = Root.getOpcode();
  MachineBasicBlock &MBB = *Root.getParent();

  switch (Opc) {
  case AArch64::SUBWrr:
  case AArch64::SUBSWrr:
  case AArch64::SUBXrr:
  case AArch64::SUBSXrr:
    // Found candidate root.
    break;
  default:
    return false;
  }

  if (isCombineInstrSettingFlag(Opc) &&
      Root.findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
    return false;

  if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
      canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
      canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
      canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
    Patterns.push_back(MachineCombinerPattern::SUBADD_OP1);
    Patterns.push_back(MachineCombinerPattern::SUBADD_OP2);
    return true;
  }

  return false;
}

/// Return true when there is potentially a faster code sequence for an
/// instruction chain ending in \p Root. All potential patterns are listed in
/// the \p Patterns vector. Patterns should be sorted in priority order since
/// the pattern evaluator stops checking as soon as it finds a faster sequence.
bool AArch64InstrInfo::getMachineCombinerPatterns(
    MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
    bool DoRegPressureReduce) const {
  // Integer patterns
  if (getMaddPatterns(Root, Patterns))
    return true;
  // Floating point patterns
  if (getFMULPatterns(Root, Patterns))
    return true;
  if (getFMAPatterns(Root, Patterns))
    return true;

  // Other patterns
  if (getMiscPatterns(Root, Patterns))
    return true;

  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
                                                     DoRegPressureReduce);
}

enum class FMAInstKind { Default, Indexed, Accumulator };
/// genFusedMultiply - Generate fused multiply instructions.
/// This function supports both integer and floating point instructions.
/// A typical example:
///   F|MUL I=A,B,0
///   F|ADD R,I,C
///   ==> F|MADD R,A,B,C
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the F|ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the F|MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the f|madd instruction
/// \param RC Register class of operands
/// \param kind The kind of FMA instruction (addressing mode) to be generated
/// \param ReplacedAddend is the result register from the instruction
/// replacing the non-combined operand, if any.
static MachineInstr *
genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
                 const TargetInstrInfo *TII, MachineInstr &Root,
                 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
                 unsigned MaddOpc, const TargetRegisterClass *RC,
                 FMAInstKind kind = FMAInstKind::Default,
                 const Register *ReplacedAddend = nullptr) {
  assert(IdxMulOpd == 1 || IdxMulOpd == 2);

  unsigned IdxOtherOpd = IdxMulOpd == 1 ?
2 : 1; 5597 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); 5598 Register ResultReg = Root.getOperand(0).getReg(); 5599 Register SrcReg0 = MUL->getOperand(1).getReg(); 5600 bool Src0IsKill = MUL->getOperand(1).isKill(); 5601 Register SrcReg1 = MUL->getOperand(2).getReg(); 5602 bool Src1IsKill = MUL->getOperand(2).isKill(); 5603 5604 Register SrcReg2; 5605 bool Src2IsKill; 5606 if (ReplacedAddend) { 5607 // If we just generated a new addend, we must be it's only use. 5608 SrcReg2 = *ReplacedAddend; 5609 Src2IsKill = true; 5610 } else { 5611 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg(); 5612 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill(); 5613 } 5614 5615 if (ResultReg.isVirtual()) 5616 MRI.constrainRegClass(ResultReg, RC); 5617 if (SrcReg0.isVirtual()) 5618 MRI.constrainRegClass(SrcReg0, RC); 5619 if (SrcReg1.isVirtual()) 5620 MRI.constrainRegClass(SrcReg1, RC); 5621 if (SrcReg2.isVirtual()) 5622 MRI.constrainRegClass(SrcReg2, RC); 5623 5624 MachineInstrBuilder MIB; 5625 if (kind == FMAInstKind::Default) 5626 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg) 5627 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 5628 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 5629 .addReg(SrcReg2, getKillRegState(Src2IsKill)); 5630 else if (kind == FMAInstKind::Indexed) 5631 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg) 5632 .addReg(SrcReg2, getKillRegState(Src2IsKill)) 5633 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 5634 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 5635 .addImm(MUL->getOperand(3).getImm()); 5636 else if (kind == FMAInstKind::Accumulator) 5637 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg) 5638 .addReg(SrcReg2, getKillRegState(Src2IsKill)) 5639 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 5640 .addReg(SrcReg1, getKillRegState(Src1IsKill)); 5641 else 5642 assert(false && "Invalid FMA instruction kind \n"); 5643 // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL) 5644 InsInstrs.push_back(MIB); 5645 return MUL; 5646 } 5647 5648 /// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane) 5649 static MachineInstr * 5650 genIndexedMultiply(MachineInstr &Root, 5651 SmallVectorImpl<MachineInstr *> &InsInstrs, 5652 unsigned IdxDupOp, unsigned MulOpc, 5653 const TargetRegisterClass *RC, MachineRegisterInfo &MRI) { 5654 assert(((IdxDupOp == 1) || (IdxDupOp == 2)) && 5655 "Invalid index of FMUL operand"); 5656 5657 MachineFunction &MF = *Root.getMF(); 5658 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); 5659 5660 MachineInstr *Dup = 5661 MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg()); 5662 5663 if (Dup->getOpcode() == TargetOpcode::COPY) 5664 Dup = MRI.getUniqueVRegDef(Dup->getOperand(1).getReg()); 5665 5666 Register DupSrcReg = Dup->getOperand(1).getReg(); 5667 MRI.clearKillFlags(DupSrcReg); 5668 MRI.constrainRegClass(DupSrcReg, RC); 5669 5670 unsigned DupSrcLane = Dup->getOperand(2).getImm(); 5671 5672 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1; 5673 MachineOperand &MulOp = Root.getOperand(IdxMulOp); 5674 5675 Register ResultReg = Root.getOperand(0).getReg(); 5676 5677 MachineInstrBuilder MIB; 5678 MIB = BuildMI(MF, MIMetadata(Root), TII->get(MulOpc), ResultReg) 5679 .add(MulOp) 5680 .addReg(DupSrcReg) 5681 .addImm(DupSrcLane); 5682 5683 InsInstrs.push_back(MIB); 5684 return &Root; 5685 } 5686 5687 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate 5688 /// instructions. 
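/// An illustrative sketch of the accumulator form (hypothetical registers):
///   mul v3.2s, v1.2s, v2.2s
///   add v0.2s, v0.2s, v3.2s
/// ==>
///   mla v0.2s, v1.2s, v2.2s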
5689 /// 5690 /// \see genFusedMultiply 5691 static MachineInstr *genFusedMultiplyAcc( 5692 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 5693 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 5694 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) { 5695 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 5696 FMAInstKind::Accumulator); 5697 } 5698 5699 /// genNeg - Helper to generate an intermediate negation of the second operand 5700 /// of Root 5701 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI, 5702 const TargetInstrInfo *TII, MachineInstr &Root, 5703 SmallVectorImpl<MachineInstr *> &InsInstrs, 5704 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, 5705 unsigned MnegOpc, const TargetRegisterClass *RC) { 5706 Register NewVR = MRI.createVirtualRegister(RC); 5707 MachineInstrBuilder MIB = 5708 BuildMI(MF, MIMetadata(Root), TII->get(MnegOpc), NewVR) 5709 .add(Root.getOperand(2)); 5710 InsInstrs.push_back(MIB); 5711 5712 assert(InstrIdxForVirtReg.empty()); 5713 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5714 5715 return NewVR; 5716 } 5717 5718 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate 5719 /// instructions with an additional negation of the accumulator 5720 static MachineInstr *genFusedMultiplyAccNeg( 5721 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 5722 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 5723 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd, 5724 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) { 5725 assert(IdxMulOpd == 1); 5726 5727 Register NewVR = 5728 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC); 5729 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 5730 FMAInstKind::Accumulator, &NewVR); 5731 } 5732 5733 /// genFusedMultiplyIdx - Helper to generate fused multiply accumulate 5734 /// instructions. 
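/// An illustrative sketch of the indexed (by-element) form (hypothetical
/// registers):
///   mul v3.4h, v1.4h, v2.h[1]
///   add v0.4h, v0.4h, v3.4h
/// ==>
///   mla v0.4h, v1.4h, v2.h[1]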
///
/// \see genFusedMultiply
static MachineInstr *genFusedMultiplyIdx(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Indexed);
}

/// genFusedMultiplyIdxNeg - Helper to generate fused multiply accumulate
/// instructions with an additional negation of the accumulator
static MachineInstr *genFusedMultiplyIdxNeg(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
    unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
  assert(IdxMulOpd == 1);

  Register NewVR =
      genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);

  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Indexed, &NewVR);
}

/// genMaddR - Generate madd instruction and combine mul and add using
/// an extra virtual register
/// Example - an ADD intermediate needs to be stored in a register:
///   MUL I=A,B,0
///   ADD R,I,Imm
///   ==> ORR  V, ZR, Imm
///   ==> MADD R,A,B,V
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the madd instruction
/// \param VR is a virtual register that holds the value of an ADD operand
/// (V in the example above).
5779 /// \param RC Register class of operands 5780 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, 5781 const TargetInstrInfo *TII, MachineInstr &Root, 5782 SmallVectorImpl<MachineInstr *> &InsInstrs, 5783 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, 5784 const TargetRegisterClass *RC) { 5785 assert(IdxMulOpd == 1 || IdxMulOpd == 2); 5786 5787 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); 5788 Register ResultReg = Root.getOperand(0).getReg(); 5789 Register SrcReg0 = MUL->getOperand(1).getReg(); 5790 bool Src0IsKill = MUL->getOperand(1).isKill(); 5791 Register SrcReg1 = MUL->getOperand(2).getReg(); 5792 bool Src1IsKill = MUL->getOperand(2).isKill(); 5793 5794 if (ResultReg.isVirtual()) 5795 MRI.constrainRegClass(ResultReg, RC); 5796 if (SrcReg0.isVirtual()) 5797 MRI.constrainRegClass(SrcReg0, RC); 5798 if (SrcReg1.isVirtual()) 5799 MRI.constrainRegClass(SrcReg1, RC); 5800 if (Register::isVirtualRegister(VR)) 5801 MRI.constrainRegClass(VR, RC); 5802 5803 MachineInstrBuilder MIB = 5804 BuildMI(MF, MIMetadata(Root), TII->get(MaddOpc), ResultReg) 5805 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 5806 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 5807 .addReg(VR); 5808 // Insert the MADD 5809 InsInstrs.push_back(MIB); 5810 return MUL; 5811 } 5812 5813 /// Do the following transformation 5814 /// A - (B + C) ==> (A - B) - C 5815 /// A - (B + C) ==> (A - C) - B 5816 static void 5817 genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI, 5818 const TargetInstrInfo *TII, MachineInstr &Root, 5819 SmallVectorImpl<MachineInstr *> &InsInstrs, 5820 SmallVectorImpl<MachineInstr *> &DelInstrs, 5821 unsigned IdxOpd1, 5822 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) { 5823 assert(IdxOpd1 == 1 || IdxOpd1 == 2); 5824 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 
2 : 1; 5825 MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg()); 5826 5827 Register ResultReg = Root.getOperand(0).getReg(); 5828 Register RegA = Root.getOperand(1).getReg(); 5829 bool RegAIsKill = Root.getOperand(1).isKill(); 5830 Register RegB = AddMI->getOperand(IdxOpd1).getReg(); 5831 bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill(); 5832 Register RegC = AddMI->getOperand(IdxOtherOpd).getReg(); 5833 bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill(); 5834 Register NewVR = MRI.createVirtualRegister(MRI.getRegClass(RegA)); 5835 5836 unsigned Opcode = Root.getOpcode(); 5837 if (Opcode == AArch64::SUBSWrr) 5838 Opcode = AArch64::SUBWrr; 5839 else if (Opcode == AArch64::SUBSXrr) 5840 Opcode = AArch64::SUBXrr; 5841 else 5842 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) && 5843 "Unexpected instruction opcode."); 5844 5845 MachineInstrBuilder MIB1 = 5846 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), NewVR) 5847 .addReg(RegA, getKillRegState(RegAIsKill)) 5848 .addReg(RegB, getKillRegState(RegBIsKill)); 5849 MachineInstrBuilder MIB2 = 5850 BuildMI(MF, MIMetadata(Root), TII->get(Opcode), ResultReg) 5851 .addReg(NewVR, getKillRegState(true)) 5852 .addReg(RegC, getKillRegState(RegCIsKill)); 5853 5854 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5855 InsInstrs.push_back(MIB1); 5856 InsInstrs.push_back(MIB2); 5857 DelInstrs.push_back(AddMI); 5858 } 5859 5860 /// When getMachineCombinerPatterns() finds potential patterns, 5861 /// this function generates the instructions that could replace the 5862 /// original code sequence 5863 void AArch64InstrInfo::genAlternativeCodeSequence( 5864 MachineInstr &Root, MachineCombinerPattern Pattern, 5865 SmallVectorImpl<MachineInstr *> &InsInstrs, 5866 SmallVectorImpl<MachineInstr *> &DelInstrs, 5867 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const { 5868 MachineBasicBlock &MBB = *Root.getParent(); 5869 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5870 MachineFunction &MF = *MBB.getParent(); 5871 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); 5872 5873 MachineInstr *MUL = nullptr; 5874 const TargetRegisterClass *RC; 5875 unsigned Opc; 5876 switch (Pattern) { 5877 default: 5878 // Reassociate instructions. 
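    // Anything not handled explicitly below (e.g. the generic reassociation
    // patterns enabled by isAssociativeAndCommutative) is expanded by the
    // target-independent implementation.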
5879 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, 5880 DelInstrs, InstrIdxForVirtReg); 5881 return; 5882 case MachineCombinerPattern::SUBADD_OP1: 5883 // A - (B + C) 5884 // ==> (A - B) - C 5885 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1, 5886 InstrIdxForVirtReg); 5887 break; 5888 case MachineCombinerPattern::SUBADD_OP2: 5889 // A - (B + C) 5890 // ==> (A - C) - B 5891 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2, 5892 InstrIdxForVirtReg); 5893 break; 5894 case MachineCombinerPattern::MULADDW_OP1: 5895 case MachineCombinerPattern::MULADDX_OP1: 5896 // MUL I=A,B,0 5897 // ADD R,I,C 5898 // ==> MADD R,A,B,C 5899 // --- Create(MADD); 5900 if (Pattern == MachineCombinerPattern::MULADDW_OP1) { 5901 Opc = AArch64::MADDWrrr; 5902 RC = &AArch64::GPR32RegClass; 5903 } else { 5904 Opc = AArch64::MADDXrrr; 5905 RC = &AArch64::GPR64RegClass; 5906 } 5907 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5908 break; 5909 case MachineCombinerPattern::MULADDW_OP2: 5910 case MachineCombinerPattern::MULADDX_OP2: 5911 // MUL I=A,B,0 5912 // ADD R,C,I 5913 // ==> MADD R,A,B,C 5914 // --- Create(MADD); 5915 if (Pattern == MachineCombinerPattern::MULADDW_OP2) { 5916 Opc = AArch64::MADDWrrr; 5917 RC = &AArch64::GPR32RegClass; 5918 } else { 5919 Opc = AArch64::MADDXrrr; 5920 RC = &AArch64::GPR64RegClass; 5921 } 5922 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5923 break; 5924 case MachineCombinerPattern::MULADDWI_OP1: 5925 case MachineCombinerPattern::MULADDXI_OP1: { 5926 // MUL I=A,B,0 5927 // ADD R,I,Imm 5928 // ==> MOV V, Imm 5929 // ==> MADD R,A,B,V 5930 // --- Create(MADD); 5931 const TargetRegisterClass *OrrRC; 5932 unsigned BitSize, OrrOpc, ZeroReg; 5933 if (Pattern == MachineCombinerPattern::MULADDWI_OP1) { 5934 OrrOpc = AArch64::ORRWri; 5935 OrrRC = &AArch64::GPR32spRegClass; 5936 BitSize = 32; 5937 ZeroReg = AArch64::WZR; 5938 Opc = AArch64::MADDWrrr; 5939 RC = &AArch64::GPR32RegClass; 5940 } else { 5941 OrrOpc = AArch64::ORRXri; 5942 OrrRC = &AArch64::GPR64spRegClass; 5943 BitSize = 64; 5944 ZeroReg = AArch64::XZR; 5945 Opc = AArch64::MADDXrrr; 5946 RC = &AArch64::GPR64RegClass; 5947 } 5948 Register NewVR = MRI.createVirtualRegister(OrrRC); 5949 uint64_t Imm = Root.getOperand(2).getImm(); 5950 5951 if (Root.getOperand(3).isImm()) { 5952 unsigned Val = Root.getOperand(3).getImm(); 5953 Imm = Imm << Val; 5954 } 5955 uint64_t UImm = SignExtend64(Imm, BitSize); 5956 // The immediate can be composed via a single instruction. 5957 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 5958 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn); 5959 if (Insn.size() != 1) 5960 return; 5961 auto MovI = Insn.begin(); 5962 MachineInstrBuilder MIB1; 5963 // MOV is an alias for one of three instructions: movz, movn, and orr. 
5964 if (MovI->Opcode == OrrOpc) 5965 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR) 5966 .addReg(ZeroReg) 5967 .addImm(MovI->Op2); 5968 else { 5969 if (BitSize == 32) 5970 assert((MovI->Opcode == AArch64::MOVNWi || 5971 MovI->Opcode == AArch64::MOVZWi) && 5972 "Expected opcode"); 5973 else 5974 assert((MovI->Opcode == AArch64::MOVNXi || 5975 MovI->Opcode == AArch64::MOVZXi) && 5976 "Expected opcode"); 5977 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR) 5978 .addImm(MovI->Op1) 5979 .addImm(MovI->Op2); 5980 } 5981 InsInstrs.push_back(MIB1); 5982 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5983 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 5984 break; 5985 } 5986 case MachineCombinerPattern::MULSUBW_OP1: 5987 case MachineCombinerPattern::MULSUBX_OP1: { 5988 // MUL I=A,B,0 5989 // SUB R,I, C 5990 // ==> SUB V, 0, C 5991 // ==> MADD R,A,B,V // = -C + A*B 5992 // --- Create(MADD); 5993 const TargetRegisterClass *SubRC; 5994 unsigned SubOpc, ZeroReg; 5995 if (Pattern == MachineCombinerPattern::MULSUBW_OP1) { 5996 SubOpc = AArch64::SUBWrr; 5997 SubRC = &AArch64::GPR32spRegClass; 5998 ZeroReg = AArch64::WZR; 5999 Opc = AArch64::MADDWrrr; 6000 RC = &AArch64::GPR32RegClass; 6001 } else { 6002 SubOpc = AArch64::SUBXrr; 6003 SubRC = &AArch64::GPR64spRegClass; 6004 ZeroReg = AArch64::XZR; 6005 Opc = AArch64::MADDXrrr; 6006 RC = &AArch64::GPR64RegClass; 6007 } 6008 Register NewVR = MRI.createVirtualRegister(SubRC); 6009 // SUB NewVR, 0, C 6010 MachineInstrBuilder MIB1 = 6011 BuildMI(MF, MIMetadata(Root), TII->get(SubOpc), NewVR) 6012 .addReg(ZeroReg) 6013 .add(Root.getOperand(2)); 6014 InsInstrs.push_back(MIB1); 6015 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6016 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 6017 break; 6018 } 6019 case MachineCombinerPattern::MULSUBW_OP2: 6020 case MachineCombinerPattern::MULSUBX_OP2: 6021 // MUL I=A,B,0 6022 // SUB R,C,I 6023 // ==> MSUB R,A,B,C (computes C - A*B) 6024 // --- Create(MSUB); 6025 if (Pattern == MachineCombinerPattern::MULSUBW_OP2) { 6026 Opc = AArch64::MSUBWrrr; 6027 RC = &AArch64::GPR32RegClass; 6028 } else { 6029 Opc = AArch64::MSUBXrrr; 6030 RC = &AArch64::GPR64RegClass; 6031 } 6032 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6033 break; 6034 case MachineCombinerPattern::MULSUBWI_OP1: 6035 case MachineCombinerPattern::MULSUBXI_OP1: { 6036 // MUL I=A,B,0 6037 // SUB R,I, Imm 6038 // ==> MOV V, -Imm 6039 // ==> MADD R,A,B,V // = -Imm + A*B 6040 // --- Create(MADD); 6041 const TargetRegisterClass *OrrRC; 6042 unsigned BitSize, OrrOpc, ZeroReg; 6043 if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) { 6044 OrrOpc = AArch64::ORRWri; 6045 OrrRC = &AArch64::GPR32spRegClass; 6046 BitSize = 32; 6047 ZeroReg = AArch64::WZR; 6048 Opc = AArch64::MADDWrrr; 6049 RC = &AArch64::GPR32RegClass; 6050 } else { 6051 OrrOpc = AArch64::ORRXri; 6052 OrrRC = &AArch64::GPR64spRegClass; 6053 BitSize = 64; 6054 ZeroReg = AArch64::XZR; 6055 Opc = AArch64::MADDXrrr; 6056 RC = &AArch64::GPR64RegClass; 6057 } 6058 Register NewVR = MRI.createVirtualRegister(OrrRC); 6059 uint64_t Imm = Root.getOperand(2).getImm(); 6060 if (Root.getOperand(3).isImm()) { 6061 unsigned Val = Root.getOperand(3).getImm(); 6062 Imm = Imm << Val; 6063 } 6064 uint64_t UImm = SignExtend64(-Imm, BitSize); 6065 // The immediate can be composed via a single instruction. 
6066 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; 6067 AArch64_IMM::expandMOVImm(UImm, BitSize, Insn); 6068 if (Insn.size() != 1) 6069 return; 6070 auto MovI = Insn.begin(); 6071 MachineInstrBuilder MIB1; 6072 // MOV is an alias for one of three instructions: movz, movn, and orr. 6073 if (MovI->Opcode == OrrOpc) 6074 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR) 6075 .addReg(ZeroReg) 6076 .addImm(MovI->Op2); 6077 else { 6078 if (BitSize == 32) 6079 assert((MovI->Opcode == AArch64::MOVNWi || 6080 MovI->Opcode == AArch64::MOVZWi) && 6081 "Expected opcode"); 6082 else 6083 assert((MovI->Opcode == AArch64::MOVNXi || 6084 MovI->Opcode == AArch64::MOVZXi) && 6085 "Expected opcode"); 6086 MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR) 6087 .addImm(MovI->Op1) 6088 .addImm(MovI->Op2); 6089 } 6090 InsInstrs.push_back(MIB1); 6091 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6092 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 6093 break; 6094 } 6095 6096 case MachineCombinerPattern::MULADDv8i8_OP1: 6097 Opc = AArch64::MLAv8i8; 6098 RC = &AArch64::FPR64RegClass; 6099 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6100 break; 6101 case MachineCombinerPattern::MULADDv8i8_OP2: 6102 Opc = AArch64::MLAv8i8; 6103 RC = &AArch64::FPR64RegClass; 6104 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6105 break; 6106 case MachineCombinerPattern::MULADDv16i8_OP1: 6107 Opc = AArch64::MLAv16i8; 6108 RC = &AArch64::FPR128RegClass; 6109 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6110 break; 6111 case MachineCombinerPattern::MULADDv16i8_OP2: 6112 Opc = AArch64::MLAv16i8; 6113 RC = &AArch64::FPR128RegClass; 6114 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6115 break; 6116 case MachineCombinerPattern::MULADDv4i16_OP1: 6117 Opc = AArch64::MLAv4i16; 6118 RC = &AArch64::FPR64RegClass; 6119 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6120 break; 6121 case MachineCombinerPattern::MULADDv4i16_OP2: 6122 Opc = AArch64::MLAv4i16; 6123 RC = &AArch64::FPR64RegClass; 6124 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6125 break; 6126 case MachineCombinerPattern::MULADDv8i16_OP1: 6127 Opc = AArch64::MLAv8i16; 6128 RC = &AArch64::FPR128RegClass; 6129 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6130 break; 6131 case MachineCombinerPattern::MULADDv8i16_OP2: 6132 Opc = AArch64::MLAv8i16; 6133 RC = &AArch64::FPR128RegClass; 6134 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6135 break; 6136 case MachineCombinerPattern::MULADDv2i32_OP1: 6137 Opc = AArch64::MLAv2i32; 6138 RC = &AArch64::FPR64RegClass; 6139 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6140 break; 6141 case MachineCombinerPattern::MULADDv2i32_OP2: 6142 Opc = AArch64::MLAv2i32; 6143 RC = &AArch64::FPR64RegClass; 6144 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6145 break; 6146 case MachineCombinerPattern::MULADDv4i32_OP1: 6147 Opc = AArch64::MLAv4i32; 6148 RC = &AArch64::FPR128RegClass; 6149 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6150 break; 6151 case MachineCombinerPattern::MULADDv4i32_OP2: 6152 Opc = AArch64::MLAv4i32; 6153 RC = &AArch64::FPR128RegClass; 6154 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6155 break; 6156 6157 case MachineCombinerPattern::MULSUBv8i8_OP1: 6158 Opc = AArch64::MLAv8i8; 6159 RC = 
&AArch64::FPR64RegClass; 6160 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 6161 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8, 6162 RC); 6163 break; 6164 case MachineCombinerPattern::MULSUBv8i8_OP2: 6165 Opc = AArch64::MLSv8i8; 6166 RC = &AArch64::FPR64RegClass; 6167 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6168 break; 6169 case MachineCombinerPattern::MULSUBv16i8_OP1: 6170 Opc = AArch64::MLAv16i8; 6171 RC = &AArch64::FPR128RegClass; 6172 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 6173 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8, 6174 RC); 6175 break; 6176 case MachineCombinerPattern::MULSUBv16i8_OP2: 6177 Opc = AArch64::MLSv16i8; 6178 RC = &AArch64::FPR128RegClass; 6179 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6180 break; 6181 case MachineCombinerPattern::MULSUBv4i16_OP1: 6182 Opc = AArch64::MLAv4i16; 6183 RC = &AArch64::FPR64RegClass; 6184 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 6185 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, 6186 RC); 6187 break; 6188 case MachineCombinerPattern::MULSUBv4i16_OP2: 6189 Opc = AArch64::MLSv4i16; 6190 RC = &AArch64::FPR64RegClass; 6191 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6192 break; 6193 case MachineCombinerPattern::MULSUBv8i16_OP1: 6194 Opc = AArch64::MLAv8i16; 6195 RC = &AArch64::FPR128RegClass; 6196 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 6197 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, 6198 RC); 6199 break; 6200 case MachineCombinerPattern::MULSUBv8i16_OP2: 6201 Opc = AArch64::MLSv8i16; 6202 RC = &AArch64::FPR128RegClass; 6203 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6204 break; 6205 case MachineCombinerPattern::MULSUBv2i32_OP1: 6206 Opc = AArch64::MLAv2i32; 6207 RC = &AArch64::FPR64RegClass; 6208 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 6209 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, 6210 RC); 6211 break; 6212 case MachineCombinerPattern::MULSUBv2i32_OP2: 6213 Opc = AArch64::MLSv2i32; 6214 RC = &AArch64::FPR64RegClass; 6215 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6216 break; 6217 case MachineCombinerPattern::MULSUBv4i32_OP1: 6218 Opc = AArch64::MLAv4i32; 6219 RC = &AArch64::FPR128RegClass; 6220 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 6221 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, 6222 RC); 6223 break; 6224 case MachineCombinerPattern::MULSUBv4i32_OP2: 6225 Opc = AArch64::MLSv4i32; 6226 RC = &AArch64::FPR128RegClass; 6227 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6228 break; 6229 6230 case MachineCombinerPattern::MULADDv4i16_indexed_OP1: 6231 Opc = AArch64::MLAv4i16_indexed; 6232 RC = &AArch64::FPR64RegClass; 6233 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6234 break; 6235 case MachineCombinerPattern::MULADDv4i16_indexed_OP2: 6236 Opc = AArch64::MLAv4i16_indexed; 6237 RC = &AArch64::FPR64RegClass; 6238 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6239 break; 6240 case MachineCombinerPattern::MULADDv8i16_indexed_OP1: 6241 Opc = AArch64::MLAv8i16_indexed; 6242 RC = &AArch64::FPR128RegClass; 6243 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6244 break; 6245 case MachineCombinerPattern::MULADDv8i16_indexed_OP2: 6246 Opc = AArch64::MLAv8i16_indexed; 6247 RC = &AArch64::FPR128RegClass; 6248 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6249 break; 6250 case 
MachineCombinerPattern::MULADDv2i32_indexed_OP1: 6251 Opc = AArch64::MLAv2i32_indexed; 6252 RC = &AArch64::FPR64RegClass; 6253 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6254 break; 6255 case MachineCombinerPattern::MULADDv2i32_indexed_OP2: 6256 Opc = AArch64::MLAv2i32_indexed; 6257 RC = &AArch64::FPR64RegClass; 6258 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6259 break; 6260 case MachineCombinerPattern::MULADDv4i32_indexed_OP1: 6261 Opc = AArch64::MLAv4i32_indexed; 6262 RC = &AArch64::FPR128RegClass; 6263 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6264 break; 6265 case MachineCombinerPattern::MULADDv4i32_indexed_OP2: 6266 Opc = AArch64::MLAv4i32_indexed; 6267 RC = &AArch64::FPR128RegClass; 6268 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6269 break; 6270 6271 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 6272 Opc = AArch64::MLAv4i16_indexed; 6273 RC = &AArch64::FPR64RegClass; 6274 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 6275 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, 6276 RC); 6277 break; 6278 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 6279 Opc = AArch64::MLSv4i16_indexed; 6280 RC = &AArch64::FPR64RegClass; 6281 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6282 break; 6283 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 6284 Opc = AArch64::MLAv8i16_indexed; 6285 RC = &AArch64::FPR128RegClass; 6286 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 6287 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, 6288 RC); 6289 break; 6290 case MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 6291 Opc = AArch64::MLSv8i16_indexed; 6292 RC = &AArch64::FPR128RegClass; 6293 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6294 break; 6295 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 6296 Opc = AArch64::MLAv2i32_indexed; 6297 RC = &AArch64::FPR64RegClass; 6298 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 6299 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, 6300 RC); 6301 break; 6302 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 6303 Opc = AArch64::MLSv2i32_indexed; 6304 RC = &AArch64::FPR64RegClass; 6305 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6306 break; 6307 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 6308 Opc = AArch64::MLAv4i32_indexed; 6309 RC = &AArch64::FPR128RegClass; 6310 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 6311 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, 6312 RC); 6313 break; 6314 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 6315 Opc = AArch64::MLSv4i32_indexed; 6316 RC = &AArch64::FPR128RegClass; 6317 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6318 break; 6319 6320 // Floating Point Support 6321 case MachineCombinerPattern::FMULADDH_OP1: 6322 Opc = AArch64::FMADDHrrr; 6323 RC = &AArch64::FPR16RegClass; 6324 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6325 break; 6326 case MachineCombinerPattern::FMULADDS_OP1: 6327 Opc = AArch64::FMADDSrrr; 6328 RC = &AArch64::FPR32RegClass; 6329 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6330 break; 6331 case MachineCombinerPattern::FMULADDD_OP1: 6332 Opc = AArch64::FMADDDrrr; 6333 RC = &AArch64::FPR64RegClass; 6334 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6335 break; 6336 6337 case MachineCombinerPattern::FMULADDH_OP2: 6338 Opc = AArch64::FMADDHrrr; 
6339 RC = &AArch64::FPR16RegClass; 6340 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6341 break; 6342 case MachineCombinerPattern::FMULADDS_OP2: 6343 Opc = AArch64::FMADDSrrr; 6344 RC = &AArch64::FPR32RegClass; 6345 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6346 break; 6347 case MachineCombinerPattern::FMULADDD_OP2: 6348 Opc = AArch64::FMADDDrrr; 6349 RC = &AArch64::FPR64RegClass; 6350 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6351 break; 6352 6353 case MachineCombinerPattern::FMLAv1i32_indexed_OP1: 6354 Opc = AArch64::FMLAv1i32_indexed; 6355 RC = &AArch64::FPR32RegClass; 6356 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6357 FMAInstKind::Indexed); 6358 break; 6359 case MachineCombinerPattern::FMLAv1i32_indexed_OP2: 6360 Opc = AArch64::FMLAv1i32_indexed; 6361 RC = &AArch64::FPR32RegClass; 6362 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6363 FMAInstKind::Indexed); 6364 break; 6365 6366 case MachineCombinerPattern::FMLAv1i64_indexed_OP1: 6367 Opc = AArch64::FMLAv1i64_indexed; 6368 RC = &AArch64::FPR64RegClass; 6369 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6370 FMAInstKind::Indexed); 6371 break; 6372 case MachineCombinerPattern::FMLAv1i64_indexed_OP2: 6373 Opc = AArch64::FMLAv1i64_indexed; 6374 RC = &AArch64::FPR64RegClass; 6375 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6376 FMAInstKind::Indexed); 6377 break; 6378 6379 case MachineCombinerPattern::FMLAv4i16_indexed_OP1: 6380 RC = &AArch64::FPR64RegClass; 6381 Opc = AArch64::FMLAv4i16_indexed; 6382 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6383 FMAInstKind::Indexed); 6384 break; 6385 case MachineCombinerPattern::FMLAv4f16_OP1: 6386 RC = &AArch64::FPR64RegClass; 6387 Opc = AArch64::FMLAv4f16; 6388 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6389 FMAInstKind::Accumulator); 6390 break; 6391 case MachineCombinerPattern::FMLAv4i16_indexed_OP2: 6392 RC = &AArch64::FPR64RegClass; 6393 Opc = AArch64::FMLAv4i16_indexed; 6394 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6395 FMAInstKind::Indexed); 6396 break; 6397 case MachineCombinerPattern::FMLAv4f16_OP2: 6398 RC = &AArch64::FPR64RegClass; 6399 Opc = AArch64::FMLAv4f16; 6400 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6401 FMAInstKind::Accumulator); 6402 break; 6403 6404 case MachineCombinerPattern::FMLAv2i32_indexed_OP1: 6405 case MachineCombinerPattern::FMLAv2f32_OP1: 6406 RC = &AArch64::FPR64RegClass; 6407 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) { 6408 Opc = AArch64::FMLAv2i32_indexed; 6409 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6410 FMAInstKind::Indexed); 6411 } else { 6412 Opc = AArch64::FMLAv2f32; 6413 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6414 FMAInstKind::Accumulator); 6415 } 6416 break; 6417 case MachineCombinerPattern::FMLAv2i32_indexed_OP2: 6418 case MachineCombinerPattern::FMLAv2f32_OP2: 6419 RC = &AArch64::FPR64RegClass; 6420 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) { 6421 Opc = AArch64::FMLAv2i32_indexed; 6422 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6423 FMAInstKind::Indexed); 6424 } else { 6425 Opc = AArch64::FMLAv2f32; 6426 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6427 FMAInstKind::Accumulator); 6428 } 6429 break; 6430 6431 case MachineCombinerPattern::FMLAv8i16_indexed_OP1: 6432 RC = 
&AArch64::FPR128RegClass; 6433 Opc = AArch64::FMLAv8i16_indexed; 6434 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6435 FMAInstKind::Indexed); 6436 break; 6437 case MachineCombinerPattern::FMLAv8f16_OP1: 6438 RC = &AArch64::FPR128RegClass; 6439 Opc = AArch64::FMLAv8f16; 6440 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6441 FMAInstKind::Accumulator); 6442 break; 6443 case MachineCombinerPattern::FMLAv8i16_indexed_OP2: 6444 RC = &AArch64::FPR128RegClass; 6445 Opc = AArch64::FMLAv8i16_indexed; 6446 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6447 FMAInstKind::Indexed); 6448 break; 6449 case MachineCombinerPattern::FMLAv8f16_OP2: 6450 RC = &AArch64::FPR128RegClass; 6451 Opc = AArch64::FMLAv8f16; 6452 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6453 FMAInstKind::Accumulator); 6454 break; 6455 6456 case MachineCombinerPattern::FMLAv2i64_indexed_OP1: 6457 case MachineCombinerPattern::FMLAv2f64_OP1: 6458 RC = &AArch64::FPR128RegClass; 6459 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) { 6460 Opc = AArch64::FMLAv2i64_indexed; 6461 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6462 FMAInstKind::Indexed); 6463 } else { 6464 Opc = AArch64::FMLAv2f64; 6465 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6466 FMAInstKind::Accumulator); 6467 } 6468 break; 6469 case MachineCombinerPattern::FMLAv2i64_indexed_OP2: 6470 case MachineCombinerPattern::FMLAv2f64_OP2: 6471 RC = &AArch64::FPR128RegClass; 6472 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) { 6473 Opc = AArch64::FMLAv2i64_indexed; 6474 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6475 FMAInstKind::Indexed); 6476 } else { 6477 Opc = AArch64::FMLAv2f64; 6478 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6479 FMAInstKind::Accumulator); 6480 } 6481 break; 6482 6483 case MachineCombinerPattern::FMLAv4i32_indexed_OP1: 6484 case MachineCombinerPattern::FMLAv4f32_OP1: 6485 RC = &AArch64::FPR128RegClass; 6486 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) { 6487 Opc = AArch64::FMLAv4i32_indexed; 6488 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6489 FMAInstKind::Indexed); 6490 } else { 6491 Opc = AArch64::FMLAv4f32; 6492 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6493 FMAInstKind::Accumulator); 6494 } 6495 break; 6496 6497 case MachineCombinerPattern::FMLAv4i32_indexed_OP2: 6498 case MachineCombinerPattern::FMLAv4f32_OP2: 6499 RC = &AArch64::FPR128RegClass; 6500 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) { 6501 Opc = AArch64::FMLAv4i32_indexed; 6502 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6503 FMAInstKind::Indexed); 6504 } else { 6505 Opc = AArch64::FMLAv4f32; 6506 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6507 FMAInstKind::Accumulator); 6508 } 6509 break; 6510 6511 case MachineCombinerPattern::FMULSUBH_OP1: 6512 Opc = AArch64::FNMSUBHrrr; 6513 RC = &AArch64::FPR16RegClass; 6514 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6515 break; 6516 case MachineCombinerPattern::FMULSUBS_OP1: 6517 Opc = AArch64::FNMSUBSrrr; 6518 RC = &AArch64::FPR32RegClass; 6519 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6520 break; 6521 case MachineCombinerPattern::FMULSUBD_OP1: 6522 Opc = AArch64::FNMSUBDrrr; 6523 RC = &AArch64::FPR64RegClass; 6524 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6525 
break; 6526 6527 case MachineCombinerPattern::FNMULSUBH_OP1: 6528 Opc = AArch64::FNMADDHrrr; 6529 RC = &AArch64::FPR16RegClass; 6530 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6531 break; 6532 case MachineCombinerPattern::FNMULSUBS_OP1: 6533 Opc = AArch64::FNMADDSrrr; 6534 RC = &AArch64::FPR32RegClass; 6535 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6536 break; 6537 case MachineCombinerPattern::FNMULSUBD_OP1: 6538 Opc = AArch64::FNMADDDrrr; 6539 RC = &AArch64::FPR64RegClass; 6540 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 6541 break; 6542 6543 case MachineCombinerPattern::FMULSUBH_OP2: 6544 Opc = AArch64::FMSUBHrrr; 6545 RC = &AArch64::FPR16RegClass; 6546 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6547 break; 6548 case MachineCombinerPattern::FMULSUBS_OP2: 6549 Opc = AArch64::FMSUBSrrr; 6550 RC = &AArch64::FPR32RegClass; 6551 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6552 break; 6553 case MachineCombinerPattern::FMULSUBD_OP2: 6554 Opc = AArch64::FMSUBDrrr; 6555 RC = &AArch64::FPR64RegClass; 6556 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 6557 break; 6558 6559 case MachineCombinerPattern::FMLSv1i32_indexed_OP2: 6560 Opc = AArch64::FMLSv1i32_indexed; 6561 RC = &AArch64::FPR32RegClass; 6562 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6563 FMAInstKind::Indexed); 6564 break; 6565 6566 case MachineCombinerPattern::FMLSv1i64_indexed_OP2: 6567 Opc = AArch64::FMLSv1i64_indexed; 6568 RC = &AArch64::FPR64RegClass; 6569 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6570 FMAInstKind::Indexed); 6571 break; 6572 6573 case MachineCombinerPattern::FMLSv4f16_OP1: 6574 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: { 6575 RC = &AArch64::FPR64RegClass; 6576 Register NewVR = MRI.createVirtualRegister(RC); 6577 MachineInstrBuilder MIB1 = 6578 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR) 6579 .add(Root.getOperand(2)); 6580 InsInstrs.push_back(MIB1); 6581 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6582 if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) { 6583 Opc = AArch64::FMLAv4f16; 6584 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6585 FMAInstKind::Accumulator, &NewVR); 6586 } else { 6587 Opc = AArch64::FMLAv4i16_indexed; 6588 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6589 FMAInstKind::Indexed, &NewVR); 6590 } 6591 break; 6592 } 6593 case MachineCombinerPattern::FMLSv4f16_OP2: 6594 RC = &AArch64::FPR64RegClass; 6595 Opc = AArch64::FMLSv4f16; 6596 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6597 FMAInstKind::Accumulator); 6598 break; 6599 case MachineCombinerPattern::FMLSv4i16_indexed_OP2: 6600 RC = &AArch64::FPR64RegClass; 6601 Opc = AArch64::FMLSv4i16_indexed; 6602 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6603 FMAInstKind::Indexed); 6604 break; 6605 6606 case MachineCombinerPattern::FMLSv2f32_OP2: 6607 case MachineCombinerPattern::FMLSv2i32_indexed_OP2: 6608 RC = &AArch64::FPR64RegClass; 6609 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) { 6610 Opc = AArch64::FMLSv2i32_indexed; 6611 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6612 FMAInstKind::Indexed); 6613 } else { 6614 Opc = AArch64::FMLSv2f32; 6615 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6616 FMAInstKind::Accumulator); 6617 } 6618 break; 6619 6620 case 
MachineCombinerPattern::FMLSv8f16_OP1: 6621 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: { 6622 RC = &AArch64::FPR128RegClass; 6623 Register NewVR = MRI.createVirtualRegister(RC); 6624 MachineInstrBuilder MIB1 = 6625 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR) 6626 .add(Root.getOperand(2)); 6627 InsInstrs.push_back(MIB1); 6628 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6629 if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) { 6630 Opc = AArch64::FMLAv8f16; 6631 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6632 FMAInstKind::Accumulator, &NewVR); 6633 } else { 6634 Opc = AArch64::FMLAv8i16_indexed; 6635 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6636 FMAInstKind::Indexed, &NewVR); 6637 } 6638 break; 6639 } 6640 case MachineCombinerPattern::FMLSv8f16_OP2: 6641 RC = &AArch64::FPR128RegClass; 6642 Opc = AArch64::FMLSv8f16; 6643 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6644 FMAInstKind::Accumulator); 6645 break; 6646 case MachineCombinerPattern::FMLSv8i16_indexed_OP2: 6647 RC = &AArch64::FPR128RegClass; 6648 Opc = AArch64::FMLSv8i16_indexed; 6649 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6650 FMAInstKind::Indexed); 6651 break; 6652 6653 case MachineCombinerPattern::FMLSv2f64_OP2: 6654 case MachineCombinerPattern::FMLSv2i64_indexed_OP2: 6655 RC = &AArch64::FPR128RegClass; 6656 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) { 6657 Opc = AArch64::FMLSv2i64_indexed; 6658 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6659 FMAInstKind::Indexed); 6660 } else { 6661 Opc = AArch64::FMLSv2f64; 6662 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6663 FMAInstKind::Accumulator); 6664 } 6665 break; 6666 6667 case MachineCombinerPattern::FMLSv4f32_OP2: 6668 case MachineCombinerPattern::FMLSv4i32_indexed_OP2: 6669 RC = &AArch64::FPR128RegClass; 6670 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) { 6671 Opc = AArch64::FMLSv4i32_indexed; 6672 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6673 FMAInstKind::Indexed); 6674 } else { 6675 Opc = AArch64::FMLSv4f32; 6676 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6677 FMAInstKind::Accumulator); 6678 } 6679 break; 6680 case MachineCombinerPattern::FMLSv2f32_OP1: 6681 case MachineCombinerPattern::FMLSv2i32_indexed_OP1: { 6682 RC = &AArch64::FPR64RegClass; 6683 Register NewVR = MRI.createVirtualRegister(RC); 6684 MachineInstrBuilder MIB1 = 6685 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR) 6686 .add(Root.getOperand(2)); 6687 InsInstrs.push_back(MIB1); 6688 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6689 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) { 6690 Opc = AArch64::FMLAv2i32_indexed; 6691 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6692 FMAInstKind::Indexed, &NewVR); 6693 } else { 6694 Opc = AArch64::FMLAv2f32; 6695 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6696 FMAInstKind::Accumulator, &NewVR); 6697 } 6698 break; 6699 } 6700 case MachineCombinerPattern::FMLSv4f32_OP1: 6701 case MachineCombinerPattern::FMLSv4i32_indexed_OP1: { 6702 RC = &AArch64::FPR128RegClass; 6703 Register NewVR = MRI.createVirtualRegister(RC); 6704 MachineInstrBuilder MIB1 = 6705 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR) 6706 .add(Root.getOperand(2)); 6707 InsInstrs.push_back(MIB1); 6708 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 
0)); 6709 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) { 6710 Opc = AArch64::FMLAv4i32_indexed; 6711 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6712 FMAInstKind::Indexed, &NewVR); 6713 } else { 6714 Opc = AArch64::FMLAv4f32; 6715 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6716 FMAInstKind::Accumulator, &NewVR); 6717 } 6718 break; 6719 } 6720 case MachineCombinerPattern::FMLSv2f64_OP1: 6721 case MachineCombinerPattern::FMLSv2i64_indexed_OP1: { 6722 RC = &AArch64::FPR128RegClass; 6723 Register NewVR = MRI.createVirtualRegister(RC); 6724 MachineInstrBuilder MIB1 = 6725 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR) 6726 .add(Root.getOperand(2)); 6727 InsInstrs.push_back(MIB1); 6728 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6729 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) { 6730 Opc = AArch64::FMLAv2i64_indexed; 6731 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6732 FMAInstKind::Indexed, &NewVR); 6733 } else { 6734 Opc = AArch64::FMLAv2f64; 6735 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6736 FMAInstKind::Accumulator, &NewVR); 6737 } 6738 break; 6739 } 6740 case MachineCombinerPattern::FMULv2i32_indexed_OP1: 6741 case MachineCombinerPattern::FMULv2i32_indexed_OP2: { 6742 unsigned IdxDupOp = 6743 (Pattern == MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1 : 2; 6744 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed, 6745 &AArch64::FPR128RegClass, MRI); 6746 break; 6747 } 6748 case MachineCombinerPattern::FMULv2i64_indexed_OP1: 6749 case MachineCombinerPattern::FMULv2i64_indexed_OP2: { 6750 unsigned IdxDupOp = 6751 (Pattern == MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1 : 2; 6752 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed, 6753 &AArch64::FPR128RegClass, MRI); 6754 break; 6755 } 6756 case MachineCombinerPattern::FMULv4i16_indexed_OP1: 6757 case MachineCombinerPattern::FMULv4i16_indexed_OP2: { 6758 unsigned IdxDupOp = 6759 (Pattern == MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1 : 2; 6760 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed, 6761 &AArch64::FPR128_loRegClass, MRI); 6762 break; 6763 } 6764 case MachineCombinerPattern::FMULv4i32_indexed_OP1: 6765 case MachineCombinerPattern::FMULv4i32_indexed_OP2: { 6766 unsigned IdxDupOp = 6767 (Pattern == MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1 : 2; 6768 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed, 6769 &AArch64::FPR128RegClass, MRI); 6770 break; 6771 } 6772 case MachineCombinerPattern::FMULv8i16_indexed_OP1: 6773 case MachineCombinerPattern::FMULv8i16_indexed_OP2: { 6774 unsigned IdxDupOp = 6775 (Pattern == MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1 : 2; 6776 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed, 6777 &AArch64::FPR128_loRegClass, MRI); 6778 break; 6779 } 6780 } // end switch (Pattern) 6781 // Record MUL and ADD/SUB for deletion 6782 if (MUL) 6783 DelInstrs.push_back(MUL); 6784 DelInstrs.push_back(&Root); 6785 6786 // Set the flags on the inserted instructions to be the merged flags of the 6787 // instructions that we have combined. 6788 uint16_t Flags = Root.getFlags(); 6789 if (MUL) 6790 Flags = Root.mergeFlagsWith(*MUL); 6791 for (auto *MI : InsInstrs) 6792 MI->setFlags(Flags); 6793 } 6794 6795 /// Replace csincr-branch sequence by simple conditional branch 6796 /// 6797 /// Examples: 6798 /// 1. 
\code 6799 /// csinc w9, wzr, wzr, <condition code> 6800 /// tbnz w9, #0, 0x44 6801 /// \endcode 6802 /// to 6803 /// \code 6804 /// b.<inverted condition code> 6805 /// \endcode 6806 /// 6807 /// 2. \code 6808 /// csinc w9, wzr, wzr, <condition code> 6809 /// tbz w9, #0, 0x44 6810 /// \endcode 6811 /// to 6812 /// \code 6813 /// b.<condition code> 6814 /// \endcode 6815 /// 6816 /// Replace compare and branch sequence by TBZ/TBNZ instruction when the 6817 /// compare's constant operand is power of 2. 6818 /// 6819 /// Examples: 6820 /// \code 6821 /// and w8, w8, #0x400 6822 /// cbnz w8, L1 6823 /// \endcode 6824 /// to 6825 /// \code 6826 /// tbnz w8, #10, L1 6827 /// \endcode 6828 /// 6829 /// \param MI Conditional Branch 6830 /// \return True when the simple conditional branch is generated 6831 /// 6832 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const { 6833 bool IsNegativeBranch = false; 6834 bool IsTestAndBranch = false; 6835 unsigned TargetBBInMI = 0; 6836 switch (MI.getOpcode()) { 6837 default: 6838 llvm_unreachable("Unknown branch instruction?"); 6839 case AArch64::Bcc: 6840 return false; 6841 case AArch64::CBZW: 6842 case AArch64::CBZX: 6843 TargetBBInMI = 1; 6844 break; 6845 case AArch64::CBNZW: 6846 case AArch64::CBNZX: 6847 TargetBBInMI = 1; 6848 IsNegativeBranch = true; 6849 break; 6850 case AArch64::TBZW: 6851 case AArch64::TBZX: 6852 TargetBBInMI = 2; 6853 IsTestAndBranch = true; 6854 break; 6855 case AArch64::TBNZW: 6856 case AArch64::TBNZX: 6857 TargetBBInMI = 2; 6858 IsNegativeBranch = true; 6859 IsTestAndBranch = true; 6860 break; 6861 } 6862 // So we increment a zero register and test for bits other 6863 // than bit 0? Conservatively bail out in case the verifier 6864 // missed this case. 6865 if (IsTestAndBranch && MI.getOperand(1).getImm()) 6866 return false; 6867 6868 // Find Definition. 6869 assert(MI.getParent() && "Incomplete machine instruciton\n"); 6870 MachineBasicBlock *MBB = MI.getParent(); 6871 MachineFunction *MF = MBB->getParent(); 6872 MachineRegisterInfo *MRI = &MF->getRegInfo(); 6873 Register VReg = MI.getOperand(0).getReg(); 6874 if (!VReg.isVirtual()) 6875 return false; 6876 6877 MachineInstr *DefMI = MRI->getVRegDef(VReg); 6878 6879 // Look through COPY instructions to find definition. 6880 while (DefMI->isCopy()) { 6881 Register CopyVReg = DefMI->getOperand(1).getReg(); 6882 if (!MRI->hasOneNonDBGUse(CopyVReg)) 6883 return false; 6884 if (!MRI->hasOneDef(CopyVReg)) 6885 return false; 6886 DefMI = MRI->getVRegDef(CopyVReg); 6887 } 6888 6889 switch (DefMI->getOpcode()) { 6890 default: 6891 return false; 6892 // Fold AND into a TBZ/TBNZ if constant operand is power of 2. 6893 case AArch64::ANDWri: 6894 case AArch64::ANDXri: { 6895 if (IsTestAndBranch) 6896 return false; 6897 if (DefMI->getParent() != MBB) 6898 return false; 6899 if (!MRI->hasOneNonDBGUse(VReg)) 6900 return false; 6901 6902 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri); 6903 uint64_t Mask = AArch64_AM::decodeLogicalImmediate( 6904 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64); 6905 if (!isPowerOf2_64(Mask)) 6906 return false; 6907 6908 MachineOperand &MO = DefMI->getOperand(1); 6909 Register NewReg = MO.getReg(); 6910 if (!NewReg.isVirtual()) 6911 return false; 6912 6913 assert(!MRI->def_empty(NewReg) && "Register must be defined."); 6914 6915 MachineBasicBlock &RefToMBB = *MBB; 6916 MachineBasicBlock *TBB = MI.getOperand(1).getMBB(); 6917 DebugLoc DL = MI.getDebugLoc(); 6918 unsigned Imm = Log2_64(Mask); 6919 unsigned Opc = (Imm < 32) 6920 ? 
(IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW) 6921 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX); 6922 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc)) 6923 .addReg(NewReg) 6924 .addImm(Imm) 6925 .addMBB(TBB); 6926 // Register lives on to the CBZ now. 6927 MO.setIsKill(false); 6928 6929 // For immediate smaller than 32, we need to use the 32-bit 6930 // variant (W) in all cases. Indeed the 64-bit variant does not 6931 // allow to encode them. 6932 // Therefore, if the input register is 64-bit, we need to take the 6933 // 32-bit sub-part. 6934 if (!Is32Bit && Imm < 32) 6935 NewMI->getOperand(0).setSubReg(AArch64::sub_32); 6936 MI.eraseFromParent(); 6937 return true; 6938 } 6939 // Look for CSINC 6940 case AArch64::CSINCWr: 6941 case AArch64::CSINCXr: { 6942 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR && 6943 DefMI->getOperand(2).getReg() == AArch64::WZR) && 6944 !(DefMI->getOperand(1).getReg() == AArch64::XZR && 6945 DefMI->getOperand(2).getReg() == AArch64::XZR)) 6946 return false; 6947 6948 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) 6949 return false; 6950 6951 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm(); 6952 // Convert only when the condition code is not modified between 6953 // the CSINC and the branch. The CC may be used by other 6954 // instructions in between. 6955 if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write)) 6956 return false; 6957 MachineBasicBlock &RefToMBB = *MBB; 6958 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB(); 6959 DebugLoc DL = MI.getDebugLoc(); 6960 if (IsNegativeBranch) 6961 CC = AArch64CC::getInvertedCondCode(CC); 6962 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB); 6963 MI.eraseFromParent(); 6964 return true; 6965 } 6966 } 6967 } 6968 6969 std::pair<unsigned, unsigned> 6970 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { 6971 const unsigned Mask = AArch64II::MO_FRAGMENT; 6972 return std::make_pair(TF & Mask, TF & ~Mask); 6973 } 6974 6975 ArrayRef<std::pair<unsigned, const char *>> 6976 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { 6977 using namespace AArch64II; 6978 6979 static const std::pair<unsigned, const char *> TargetFlags[] = { 6980 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"}, 6981 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"}, 6982 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"}, 6983 {MO_HI12, "aarch64-hi12"}}; 6984 return ArrayRef(TargetFlags); 6985 } 6986 6987 ArrayRef<std::pair<unsigned, const char *>> 6988 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { 6989 using namespace AArch64II; 6990 6991 static const std::pair<unsigned, const char *> TargetFlags[] = { 6992 {MO_COFFSTUB, "aarch64-coffstub"}, 6993 {MO_GOT, "aarch64-got"}, 6994 {MO_NC, "aarch64-nc"}, 6995 {MO_S, "aarch64-s"}, 6996 {MO_TLS, "aarch64-tls"}, 6997 {MO_DLLIMPORT, "aarch64-dllimport"}, 6998 {MO_DLLIMPORTAUX, "aarch64-dllimportaux"}, 6999 {MO_PREL, "aarch64-prel"}, 7000 {MO_TAGGED, "aarch64-tagged"}}; 7001 return ArrayRef(TargetFlags); 7002 } 7003 7004 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> 7005 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const { 7006 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] = 7007 {{MOSuppressPair, "aarch64-suppress-pair"}, 7008 {MOStridedAccess, "aarch64-strided-access"}}; 7009 return ArrayRef(TargetFlags); 7010 } 7011 7012 /// Constants defining how 
certain sequences should be outlined. 7013 /// This encompasses how an outlined function should be called, and what kind of 7014 /// frame should be emitted for that outlined function. 7015 /// 7016 /// \p MachineOutlinerDefault implies that the function should be called with 7017 /// a save and restore of LR to the stack. 7018 /// 7019 /// That is, 7020 /// 7021 /// I1 Save LR OUTLINED_FUNCTION: 7022 /// I2 --> BL OUTLINED_FUNCTION I1 7023 /// I3 Restore LR I2 7024 /// I3 7025 /// RET 7026 /// 7027 /// * Call construction overhead: 3 (save + BL + restore) 7028 /// * Frame construction overhead: 1 (ret) 7029 /// * Requires stack fixups? Yes 7030 /// 7031 /// \p MachineOutlinerTailCall implies that the function is being created from 7032 /// a sequence of instructions ending in a return. 7033 /// 7034 /// That is, 7035 /// 7036 /// I1 OUTLINED_FUNCTION: 7037 /// I2 --> B OUTLINED_FUNCTION I1 7038 /// RET I2 7039 /// RET 7040 /// 7041 /// * Call construction overhead: 1 (B) 7042 /// * Frame construction overhead: 0 (Return included in sequence) 7043 /// * Requires stack fixups? No 7044 /// 7045 /// \p MachineOutlinerNoLRSave implies that the function should be called using 7046 /// a BL instruction, but doesn't require LR to be saved and restored. This 7047 /// happens when LR is known to be dead. 7048 /// 7049 /// That is, 7050 /// 7051 /// I1 OUTLINED_FUNCTION: 7052 /// I2 --> BL OUTLINED_FUNCTION I1 7053 /// I3 I2 7054 /// I3 7055 /// RET 7056 /// 7057 /// * Call construction overhead: 1 (BL) 7058 /// * Frame construction overhead: 1 (RET) 7059 /// * Requires stack fixups? No 7060 /// 7061 /// \p MachineOutlinerThunk implies that the function is being created from 7062 /// a sequence of instructions ending in a call. The outlined function is 7063 /// called with a BL instruction, and the outlined function tail-calls the 7064 /// original call destination. 7065 /// 7066 /// That is, 7067 /// 7068 /// I1 OUTLINED_FUNCTION: 7069 /// I2 --> BL OUTLINED_FUNCTION I1 7070 /// BL f I2 7071 /// B f 7072 /// * Call construction overhead: 1 (BL) 7073 /// * Frame construction overhead: 0 7074 /// * Requires stack fixups? No 7075 /// 7076 /// \p MachineOutlinerRegSave implies that the function should be called with a 7077 /// save and restore of LR to an available register. This allows us to avoid 7078 /// stack fixups. Note that this outlining variant is compatible with the 7079 /// NoLRSave case. 7080 /// 7081 /// That is, 7082 /// 7083 /// I1 Save LR OUTLINED_FUNCTION: 7084 /// I2 --> BL OUTLINED_FUNCTION I1 7085 /// I3 Restore LR I2 7086 /// I3 7087 /// RET 7088 /// 7089 /// * Call construction overhead: 3 (save + BL + restore) 7090 /// * Frame construction overhead: 1 (ret) 7091 /// * Requires stack fixups? No 7092 enum MachineOutlinerClass { 7093 MachineOutlinerDefault, /// Emit a save, restore, call, and return. 7094 MachineOutlinerTailCall, /// Only emit a branch. 7095 MachineOutlinerNoLRSave, /// Emit a call and return. 7096 MachineOutlinerThunk, /// Emit a call and tail-call. 7097 MachineOutlinerRegSave /// Same as default, but save to a register. 
7098 }; 7099 7100 enum MachineOutlinerMBBFlags { 7101 LRUnavailableSomewhere = 0x2, 7102 HasCalls = 0x4, 7103 UnsafeRegsDead = 0x8 7104 }; 7105 7106 Register 7107 AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const { 7108 MachineFunction *MF = C.getMF(); 7109 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo(); 7110 const AArch64RegisterInfo *ARI = 7111 static_cast<const AArch64RegisterInfo *>(&TRI); 7112 // Check if there is an available register across the sequence that we can 7113 // use. 7114 for (unsigned Reg : AArch64::GPR64RegClass) { 7115 if (!ARI->isReservedReg(*MF, Reg) && 7116 Reg != AArch64::LR && // LR is not reserved, but don't use it. 7117 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved. 7118 Reg != AArch64::X17 && // Ditto for X17. 7119 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) && 7120 C.isAvailableInsideSeq(Reg, TRI)) 7121 return Reg; 7122 } 7123 return Register(); 7124 } 7125 7126 static bool 7127 outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a, 7128 const outliner::Candidate &b) { 7129 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>(); 7130 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>(); 7131 7132 return MFIa->shouldSignReturnAddress(false) == MFIb->shouldSignReturnAddress(false) && 7133 MFIa->shouldSignReturnAddress(true) == MFIb->shouldSignReturnAddress(true); 7134 } 7135 7136 static bool 7137 outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a, 7138 const outliner::Candidate &b) { 7139 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>(); 7140 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>(); 7141 7142 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey(); 7143 } 7144 7145 static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a, 7146 const outliner::Candidate &b) { 7147 const AArch64Subtarget &SubtargetA = 7148 a.getMF()->getSubtarget<AArch64Subtarget>(); 7149 const AArch64Subtarget &SubtargetB = 7150 b.getMF()->getSubtarget<AArch64Subtarget>(); 7151 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps(); 7152 } 7153 7154 outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( 7155 std::vector<outliner::Candidate> &RepeatedSequenceLocs) const { 7156 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0]; 7157 unsigned SequenceSize = 7158 std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0, 7159 [this](unsigned Sum, const MachineInstr &MI) { 7160 return Sum + getInstSizeInBytes(MI); 7161 }); 7162 unsigned NumBytesToCreateFrame = 0; 7163 7164 // We only allow outlining for functions having exactly matching return 7165 // address signing attributes, i.e., all share the same value for the 7166 // attribute "sign-return-address" and all share the same type of key they 7167 // are signed with. 7168 // Additionally we require all functions to simultaniously either support 7169 // v8.3a features or not. Otherwise an outlined function could get signed 7170 // using dedicated v8.3 instructions and a call from a function that doesn't 7171 // support v8.3 instructions would therefore be invalid. 7172 if (std::adjacent_find( 7173 RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), 7174 [](const outliner::Candidate &a, const outliner::Candidate &b) { 7175 // Return true if a and b are non-equal w.r.t. 
return address 7176 // signing or support of v8.3a features 7177 if (outliningCandidatesSigningScopeConsensus(a, b) && 7178 outliningCandidatesSigningKeyConsensus(a, b) && 7179 outliningCandidatesV8_3OpsConsensus(a, b)) { 7180 return false; 7181 } 7182 return true; 7183 }) != RepeatedSequenceLocs.end()) { 7184 return outliner::OutlinedFunction(); 7185 } 7186 7187 // Since at this point all candidates agree on their return address signing 7188 // picking just one is fine. If the candidate functions potentially sign their 7189 // return addresses, the outlined function should do the same. Note that in 7190 // the case of "sign-return-address"="non-leaf" this is an assumption: It is 7191 // not certainly true that the outlined function will have to sign its return 7192 // address but this decision is made later, when the decision to outline 7193 // has already been made. 7194 // The same holds for the number of additional instructions we need: On 7195 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is 7196 // necessary. However, at this point we don't know if the outlined function 7197 // will have a RET instruction so we assume the worst. 7198 const TargetRegisterInfo &TRI = getRegisterInfo(); 7199 if (FirstCand.getMF() 7200 ->getInfo<AArch64FunctionInfo>() 7201 ->shouldSignReturnAddress(true)) { 7202 // One PAC and one AUT instructions 7203 NumBytesToCreateFrame += 8; 7204 7205 // We have to check if sp modifying instructions would get outlined. 7206 // If so we only allow outlining if sp is unchanged overall, so matching 7207 // sub and add instructions are okay to outline, all other sp modifications 7208 // are not 7209 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) { 7210 int SPValue = 0; 7211 MachineBasicBlock::iterator MBBI = C.front(); 7212 for (;;) { 7213 if (MBBI->modifiesRegister(AArch64::SP, &TRI)) { 7214 switch (MBBI->getOpcode()) { 7215 case AArch64::ADDXri: 7216 case AArch64::ADDWri: 7217 assert(MBBI->getNumOperands() == 4 && "Wrong number of operands"); 7218 assert(MBBI->getOperand(2).isImm() && 7219 "Expected operand to be immediate"); 7220 assert(MBBI->getOperand(1).isReg() && 7221 "Expected operand to be a register"); 7222 // Check if the add just increments sp. If so, we search for 7223 // matching sub instructions that decrement sp. If not, the 7224 // modification is illegal 7225 if (MBBI->getOperand(1).getReg() == AArch64::SP) 7226 SPValue += MBBI->getOperand(2).getImm(); 7227 else 7228 return true; 7229 break; 7230 case AArch64::SUBXri: 7231 case AArch64::SUBWri: 7232 assert(MBBI->getNumOperands() == 4 && "Wrong number of operands"); 7233 assert(MBBI->getOperand(2).isImm() && 7234 "Expected operand to be immediate"); 7235 assert(MBBI->getOperand(1).isReg() && 7236 "Expected operand to be a register"); 7237 // Check if the sub just decrements sp. If so, we search for 7238 // matching add instructions that increment sp. If not, the 7239 // modification is illegal 7240 if (MBBI->getOperand(1).getReg() == AArch64::SP) 7241 SPValue -= MBBI->getOperand(2).getImm(); 7242 else 7243 return true; 7244 break; 7245 default: 7246 return true; 7247 } 7248 } 7249 if (MBBI == C.back()) 7250 break; 7251 ++MBBI; 7252 } 7253 if (SPValue) 7254 return true; 7255 return false; 7256 }; 7257 // Remove candidates with illegal stack modifying instructions 7258 llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification); 7259 7260 // If the sequence doesn't have enough candidates left, then we're done. 
7261 if (RepeatedSequenceLocs.size() < 2) 7262 return outliner::OutlinedFunction(); 7263 } 7264 7265 // Properties about candidate MBBs that hold for all of them. 7266 unsigned FlagsSetInAll = 0xF; 7267 7268 // Compute liveness information for each candidate, and set FlagsSetInAll. 7269 for (outliner::Candidate &C : RepeatedSequenceLocs) 7270 FlagsSetInAll &= C.Flags; 7271 7272 // According to the AArch64 Procedure Call Standard, the following are 7273 // undefined on entry/exit from a function call: 7274 // 7275 // * Registers x16, x17, (and thus w16, w17) 7276 // * Condition codes (and thus the NZCV register) 7277 // 7278 // Because if this, we can't outline any sequence of instructions where 7279 // one 7280 // of these registers is live into/across it. Thus, we need to delete 7281 // those 7282 // candidates. 7283 auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) { 7284 // If the unsafe registers in this block are all dead, then we don't need 7285 // to compute liveness here. 7286 if (C.Flags & UnsafeRegsDead) 7287 return false; 7288 return C.isAnyUnavailableAcrossOrOutOfSeq( 7289 {AArch64::W16, AArch64::W17, AArch64::NZCV}, TRI); 7290 }; 7291 7292 // Are there any candidates where those registers are live? 7293 if (!(FlagsSetInAll & UnsafeRegsDead)) { 7294 // Erase every candidate that violates the restrictions above. (It could be 7295 // true that we have viable candidates, so it's not worth bailing out in 7296 // the case that, say, 1 out of 20 candidates violate the restructions.) 7297 llvm::erase_if(RepeatedSequenceLocs, CantGuaranteeValueAcrossCall); 7298 7299 // If the sequence doesn't have enough candidates left, then we're done. 7300 if (RepeatedSequenceLocs.size() < 2) 7301 return outliner::OutlinedFunction(); 7302 } 7303 7304 // At this point, we have only "safe" candidates to outline. Figure out 7305 // frame + call instruction information. 7306 7307 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode(); 7308 7309 // Helper lambda which sets call information for every candidate. 7310 auto SetCandidateCallInfo = 7311 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) { 7312 for (outliner::Candidate &C : RepeatedSequenceLocs) 7313 C.setCallInfo(CallID, NumBytesForCall); 7314 }; 7315 7316 unsigned FrameID = MachineOutlinerDefault; 7317 NumBytesToCreateFrame += 4; 7318 7319 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) { 7320 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement(); 7321 }); 7322 7323 // We check to see if CFI Instructions are present, and if they are 7324 // we find the number of CFI Instructions in the candidates. 7325 unsigned CFICount = 0; 7326 for (auto &I : make_range(RepeatedSequenceLocs[0].front(), 7327 std::next(RepeatedSequenceLocs[0].back()))) { 7328 if (I.isCFIInstruction()) 7329 CFICount++; 7330 } 7331 7332 // We compare the number of found CFI Instructions to the number of CFI 7333 // instructions in the parent function for each candidate. We must check this 7334 // since if we outline one of the CFI instructions in a function, we have to 7335 // outline them all for correctness. If we do not, the address offsets will be 7336 // incorrect between the two sections of the program. 
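  // For illustration (a sketch, not taken from a real function): if CFICount
  // is 2 but a candidate's parent function carries five CFI_INSTRUCTIONs in
  // total, outlining the candidate would strand the remaining three in the
  // caller with offsets that no longer describe the actual frame, so such
  // candidate sets are rejected by the check below.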
7337 for (outliner::Candidate &C : RepeatedSequenceLocs) { 7338 std::vector<MCCFIInstruction> CFIInstructions = 7339 C.getMF()->getFrameInstructions(); 7340 7341 if (CFICount > 0 && CFICount != CFIInstructions.size()) 7342 return outliner::OutlinedFunction(); 7343 } 7344 7345 // Returns true if an instructions is safe to fix up, false otherwise. 7346 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) { 7347 if (MI.isCall()) 7348 return true; 7349 7350 if (!MI.modifiesRegister(AArch64::SP, &TRI) && 7351 !MI.readsRegister(AArch64::SP, &TRI)) 7352 return true; 7353 7354 // Any modification of SP will break our code to save/restore LR. 7355 // FIXME: We could handle some instructions which add a constant 7356 // offset to SP, with a bit more work. 7357 if (MI.modifiesRegister(AArch64::SP, &TRI)) 7358 return false; 7359 7360 // At this point, we have a stack instruction that we might need to 7361 // fix up. We'll handle it if it's a load or store. 7362 if (MI.mayLoadOrStore()) { 7363 const MachineOperand *Base; // Filled with the base operand of MI. 7364 int64_t Offset; // Filled with the offset of MI. 7365 bool OffsetIsScalable; 7366 7367 // Does it allow us to offset the base operand and is the base the 7368 // register SP? 7369 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) || 7370 !Base->isReg() || Base->getReg() != AArch64::SP) 7371 return false; 7372 7373 // Fixe-up code below assumes bytes. 7374 if (OffsetIsScalable) 7375 return false; 7376 7377 // Find the minimum/maximum offset for this instruction and check 7378 // if fixing it up would be in range. 7379 int64_t MinOffset, 7380 MaxOffset; // Unscaled offsets for the instruction. 7381 TypeSize Scale(0U, false); // The scale to multiply the offsets by. 7382 unsigned DummyWidth; 7383 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset); 7384 7385 Offset += 16; // Update the offset to what it would be if we outlined. 7386 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() || 7387 Offset > MaxOffset * (int64_t)Scale.getFixedValue()) 7388 return false; 7389 7390 // It's in range, so we can outline it. 7391 return true; 7392 } 7393 7394 // FIXME: Add handling for instructions like "add x0, sp, #8". 7395 7396 // We can't fix it up, so don't outline it. 7397 return false; 7398 }; 7399 7400 // True if it's possible to fix up each stack instruction in this sequence. 7401 // Important for frames/call variants that modify the stack. 7402 bool AllStackInstrsSafe = std::all_of( 7403 FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup); 7404 7405 // If the last instruction in any candidate is a terminator, then we should 7406 // tail call all of the candidates. 7407 if (RepeatedSequenceLocs[0].back()->isTerminator()) { 7408 FrameID = MachineOutlinerTailCall; 7409 NumBytesToCreateFrame = 0; 7410 SetCandidateCallInfo(MachineOutlinerTailCall, 4); 7411 } 7412 7413 else if (LastInstrOpcode == AArch64::BL || 7414 ((LastInstrOpcode == AArch64::BLR || 7415 LastInstrOpcode == AArch64::BLRNoIP) && 7416 !HasBTI)) { 7417 // FIXME: Do we need to check if the code after this uses the value of LR? 7418 FrameID = MachineOutlinerThunk; 7419 NumBytesToCreateFrame = 0; 7420 SetCandidateCallInfo(MachineOutlinerThunk, 4); 7421 } 7422 7423 else { 7424 // We need to decide how to emit calls + frames. We can always emit the same 7425 // frame if we don't need to save to the stack. If we have to save to the 7426 // stack, then we need a different frame. 
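  // For illustration, a sketch of the per-call-site code shapes whose byte
  // costs are compared below (x20 is only an example of a free register):
  //
  //   MachineOutlinerNoLRSave (4 bytes):   bl   OUTLINED_FUNCTION
  //
  //   MachineOutlinerRegSave (12 bytes):   mov  x20, lr
  //                                        bl   OUTLINED_FUNCTION
  //                                        mov  lr, x20
  //
  //   MachineOutlinerDefault (12 bytes):   str  x30, [sp, #-16]!
  //                                        bl   OUTLINED_FUNCTION
  //                                        ldr  x30, [sp], #16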
7427 unsigned NumBytesNoStackCalls = 0; 7428 std::vector<outliner::Candidate> CandidatesWithoutStackFixups; 7429 7430 // Check if we have to save LR. 7431 for (outliner::Candidate &C : RepeatedSequenceLocs) { 7432 // If we have a noreturn caller, then we're going to be conservative and 7433 // say that we have to save LR. If we don't have a ret at the end of the 7434 // block, then we can't reason about liveness accurately. 7435 // 7436 // FIXME: We can probably do better than always disabling this in 7437 // noreturn functions by fixing up the liveness info. 7438 bool IsNoReturn = 7439 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn); 7440 7441 // Is LR available? If so, we don't need a save. 7442 if (C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) && !IsNoReturn) { 7443 NumBytesNoStackCalls += 4; 7444 C.setCallInfo(MachineOutlinerNoLRSave, 4); 7445 CandidatesWithoutStackFixups.push_back(C); 7446 } 7447 7448 // Is an unused register available? If so, we won't modify the stack, so 7449 // we can outline with the same frame type as those that don't save LR. 7450 else if (findRegisterToSaveLRTo(C)) { 7451 NumBytesNoStackCalls += 12; 7452 C.setCallInfo(MachineOutlinerRegSave, 12); 7453 CandidatesWithoutStackFixups.push_back(C); 7454 } 7455 7456 // Is SP used in the sequence at all? If not, we don't have to modify 7457 // the stack, so we are guaranteed to get the same frame. 7458 else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) { 7459 NumBytesNoStackCalls += 12; 7460 C.setCallInfo(MachineOutlinerDefault, 12); 7461 CandidatesWithoutStackFixups.push_back(C); 7462 } 7463 7464 // If we outline this, we need to modify the stack. Pretend we don't 7465 // outline this by saving all of its bytes. 7466 else { 7467 NumBytesNoStackCalls += SequenceSize; 7468 } 7469 } 7470 7471 // If there are no places where we have to save LR, then note that we 7472 // don't have to update the stack. Otherwise, give every candidate the 7473 // default call type, as long as it's safe to do so. 7474 if (!AllStackInstrsSafe || 7475 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) { 7476 RepeatedSequenceLocs = CandidatesWithoutStackFixups; 7477 FrameID = MachineOutlinerNoLRSave; 7478 } else { 7479 SetCandidateCallInfo(MachineOutlinerDefault, 12); 7480 7481 // Bugzilla ID: 46767 7482 // TODO: Check if fixing up the stack more than once is safe so we can 7483 // outline these. 7484 // 7485 // An outline resulting in a caller that requires stack fixups at the 7486 // callsite to a callee that also requires stack fixups can happen when 7487 // there are no available registers at the candidate callsite for a 7488 // candidate that itself also has calls. 7489 // 7490 // In other words if function_containing_sequence in the following pseudo 7491 // assembly requires that we save LR at the point of the call, but there 7492 // are no available registers: in this case we save using SP and as a 7493 // result the SP offsets requires stack fixups by multiples of 16. 7494 // 7495 // function_containing_sequence: 7496 // ... 7497 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N 7498 // call OUTLINED_FUNCTION_N 7499 // restore LR from SP 7500 // ... 7501 // 7502 // OUTLINED_FUNCTION_N: 7503 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N 7504 // ... 
7505 // bl foo 7506 // restore LR from SP 7507 // ret 7508 // 7509 // Because the code to handle more than one stack fixup does not 7510 // currently have the proper checks for legality, these cases will assert 7511 // in the AArch64 MachineOutliner. This is because the code to do this 7512 // needs more hardening, testing, better checks that generated code is 7513 // legal, etc and because it is only verified to handle a single pass of 7514 // stack fixup. 7515 // 7516 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch 7517 // these cases until they are known to be handled. Bugzilla 46767 is 7518 // referenced in comments at the assert site. 7519 // 7520 // To avoid asserting (or generating non-legal code on noassert builds) 7521 // we remove all candidates which would need more than one stack fixup by 7522 // pruning the cases where the candidate has calls while also having no 7523 // available LR and having no available general purpose registers to copy 7524 // LR to (ie one extra stack save/restore). 7525 // 7526 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { 7527 erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) { 7528 return (std::any_of( 7529 C.front(), std::next(C.back()), 7530 [](const MachineInstr &MI) { return MI.isCall(); })) && 7531 (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) || 7532 !findRegisterToSaveLRTo(C)); 7533 }); 7534 } 7535 } 7536 7537 // If we dropped all of the candidates, bail out here. 7538 if (RepeatedSequenceLocs.size() < 2) { 7539 RepeatedSequenceLocs.clear(); 7540 return outliner::OutlinedFunction(); 7541 } 7542 } 7543 7544 // Does every candidate's MBB contain a call? If so, then we might have a call 7545 // in the range. 7546 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { 7547 // Check if the range contains a call. These require a save + restore of the 7548 // link register. 7549 bool ModStackToSaveLR = false; 7550 if (std::any_of(FirstCand.front(), FirstCand.back(), 7551 [](const MachineInstr &MI) { return MI.isCall(); })) 7552 ModStackToSaveLR = true; 7553 7554 // Handle the last instruction separately. If this is a tail call, then the 7555 // last instruction is a call. We don't want to save + restore in this case. 7556 // However, it could be possible that the last instruction is a call without 7557 // it being valid to tail call this sequence. We should consider this as 7558 // well. 7559 else if (FrameID != MachineOutlinerThunk && 7560 FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall()) 7561 ModStackToSaveLR = true; 7562 7563 if (ModStackToSaveLR) { 7564 // We can't fix up the stack. Bail out. 7565 if (!AllStackInstrsSafe) { 7566 RepeatedSequenceLocs.clear(); 7567 return outliner::OutlinedFunction(); 7568 } 7569 7570 // Save + restore LR. 7571 NumBytesToCreateFrame += 8; 7572 } 7573 } 7574 7575 // If we have CFI instructions, we can only outline if the outlined section 7576 // can be a tail call 7577 if (FrameID != MachineOutlinerTailCall && CFICount > 0) 7578 return outliner::OutlinedFunction(); 7579 7580 return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, 7581 NumBytesToCreateFrame, FrameID); 7582 } 7583 7584 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom( 7585 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const { 7586 const Function &F = MF.getFunction(); 7587 7588 // Can F be deduplicated by the linker? If it can, don't outline from it. 
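  // (Link-once ODR functions are the usual suspects here: C++ inline
  // functions and template instantiations that every translation unit may
  // emit and the linker later folds into a single copy.)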
7589 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage()) 7590 return false; 7591 7592 // Don't outline from functions with section markings; the program could 7593 // expect that all the code is in the named section. 7594 // FIXME: Allow outlining from multiple functions with the same section 7595 // marking. 7596 if (F.hasSection()) 7597 return false; 7598 7599 // Outlining from functions with redzones is unsafe since the outliner may 7600 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't 7601 // outline from it. 7602 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); 7603 if (!AFI || AFI->hasRedZone().value_or(true)) 7604 return false; 7605 7606 // FIXME: Teach the outliner to generate/handle Windows unwind info. 7607 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) 7608 return false; 7609 7610 // It's safe to outline from MF. 7611 return true; 7612 } 7613 7614 bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, 7615 unsigned &Flags) const { 7616 if (!TargetInstrInfo::isMBBSafeToOutlineFrom(MBB, Flags)) 7617 return false; 7618 // Check if LR is available through all of the MBB. If it's not, then set 7619 // a flag. 7620 assert(MBB.getParent()->getRegInfo().tracksLiveness() && 7621 "Suitable Machine Function for outlining must track liveness"); 7622 LiveRegUnits LRU(getRegisterInfo()); 7623 7624 for (MachineInstr &MI : llvm::reverse(MBB)) 7625 LRU.accumulate(MI); 7626 7627 // Check if each of the unsafe registers are available... 7628 bool W16AvailableInBlock = LRU.available(AArch64::W16); 7629 bool W17AvailableInBlock = LRU.available(AArch64::W17); 7630 bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV); 7631 7632 // If all of these are dead (and not live out), we know we don't have to check 7633 // them later. 7634 if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock) 7635 Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead; 7636 7637 // Now, add the live outs to the set. 7638 LRU.addLiveOuts(MBB); 7639 7640 // If any of these registers is available in the MBB, but also a live out of 7641 // the block, then we know outlining is unsafe. 7642 if (W16AvailableInBlock && !LRU.available(AArch64::W16)) 7643 return false; 7644 if (W17AvailableInBlock && !LRU.available(AArch64::W17)) 7645 return false; 7646 if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV)) 7647 return false; 7648 7649 // Check if there's a call inside this MachineBasicBlock. If there is, then 7650 // set a flag. 7651 if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); })) 7652 Flags |= MachineOutlinerMBBFlags::HasCalls; 7653 7654 MachineFunction *MF = MBB.getParent(); 7655 7656 // In the event that we outline, we may have to save LR. If there is an 7657 // available register in the MBB, then we'll always save LR there. Check if 7658 // this is true. 7659 bool CanSaveLR = false; 7660 const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>( 7661 MF->getSubtarget().getRegisterInfo()); 7662 7663 // Check if there is an available register across the sequence that we can 7664 // use. 7665 for (unsigned Reg : AArch64::GPR64RegClass) { 7666 if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR && 7667 Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) { 7668 CanSaveLR = true; 7669 break; 7670 } 7671 } 7672 7673 // Check if we have a register we can save LR to, and if LR was used 7674 // somewhere. 
If both of those things are true, then we need to evaluate the 7675 // safety of outlining stack instructions later. 7676 if (!CanSaveLR && !LRU.available(AArch64::LR)) 7677 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere; 7678 7679 return true; 7680 } 7681 7682 outliner::InstrType 7683 AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, 7684 unsigned Flags) const { 7685 MachineInstr &MI = *MIT; 7686 MachineBasicBlock *MBB = MI.getParent(); 7687 MachineFunction *MF = MBB->getParent(); 7688 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); 7689 7690 // Don't outline anything used for return address signing. The outlined 7691 // function will get signed later if needed 7692 switch (MI.getOpcode()) { 7693 case AArch64::PACIASP: 7694 case AArch64::PACIBSP: 7695 case AArch64::AUTIASP: 7696 case AArch64::AUTIBSP: 7697 case AArch64::RETAA: 7698 case AArch64::RETAB: 7699 case AArch64::EMITBKEY: 7700 return outliner::InstrType::Illegal; 7701 } 7702 7703 // Don't outline LOHs. 7704 if (FuncInfo->getLOHRelated().count(&MI)) 7705 return outliner::InstrType::Illegal; 7706 7707 // We can only outline these if we will tail call the outlined function, or 7708 // fix up the CFI offsets. Currently, CFI instructions are outlined only if 7709 // in a tail call. 7710 // 7711 // FIXME: If the proper fixups for the offset are implemented, this should be 7712 // possible. 7713 if (MI.isCFIInstruction()) 7714 return outliner::InstrType::Legal; 7715 7716 // Don't allow debug values to impact outlining type. 7717 if (MI.isDebugInstr() || MI.isIndirectDebugValue()) 7718 return outliner::InstrType::Invisible; 7719 7720 // At this point, KILL instructions don't really tell us much so we can go 7721 // ahead and skip over them. 7722 if (MI.isKill()) 7723 return outliner::InstrType::Invisible; 7724 7725 // Is this a terminator for a basic block? 7726 if (MI.isTerminator()) { 7727 7728 // Is this the end of a function? 7729 if (MI.getParent()->succ_empty()) 7730 return outliner::InstrType::Legal; 7731 7732 // It's not, so don't outline it. 7733 return outliner::InstrType::Illegal; 7734 } 7735 7736 // Make sure none of the operands are un-outlinable. 7737 for (const MachineOperand &MOP : MI.operands()) { 7738 if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() || 7739 MOP.isTargetIndex()) 7740 return outliner::InstrType::Illegal; 7741 7742 // If it uses LR or W30 explicitly, then don't touch it. 7743 if (MOP.isReg() && !MOP.isImplicit() && 7744 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30)) 7745 return outliner::InstrType::Illegal; 7746 } 7747 7748 // Special cases for instructions that can always be outlined, but will fail 7749 // the later tests. e.g, ADRPs, which are PC-relative use LR, but can always 7750 // be outlined because they don't require a *specific* value to be in LR. 7751 if (MI.getOpcode() == AArch64::ADRP) 7752 return outliner::InstrType::Legal; 7753 7754 // If MI is a call we might be able to outline it. We don't want to outline 7755 // any calls that rely on the position of items on the stack. When we outline 7756 // something containing a call, we have to emit a save and restore of LR in 7757 // the outlined function. Currently, this always happens by saving LR to the 7758 // stack. Thus, if we outline, say, half the parameters for a function call 7759 // plus the call, then we'll break the callee's expectations for the layout 7760 // of the stack. 
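  // For illustration (a sketch): if the outlined sequence contained
  //   str x9, [sp]          ; caller setting up a stack-passed argument
  //   bl  callee
  // then, once the outlined function has spilled LR to the stack, SP at the
  // call no longer points where the caller left that argument, and the
  // callee reads the wrong slot.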
7761 // 7762 // FIXME: Allow calls to functions which construct a stack frame, as long 7763 // as they don't access arguments on the stack. 7764 // FIXME: Figure out some way to analyze functions defined in other modules. 7765 // We should be able to compute the memory usage based on the IR calling 7766 // convention, even if we can't see the definition. 7767 if (MI.isCall()) { 7768 // Get the function associated with the call. Look at each operand and find 7769 // the one that represents the callee and get its name. 7770 const Function *Callee = nullptr; 7771 for (const MachineOperand &MOP : MI.operands()) { 7772 if (MOP.isGlobal()) { 7773 Callee = dyn_cast<Function>(MOP.getGlobal()); 7774 break; 7775 } 7776 } 7777 7778 // Never outline calls to mcount. There isn't any rule that would require 7779 // this, but the Linux kernel's "ftrace" feature depends on it. 7780 if (Callee && Callee->getName() == "\01_mcount") 7781 return outliner::InstrType::Illegal; 7782 7783 // If we don't know anything about the callee, assume it depends on the 7784 // stack layout of the caller. In that case, it's only legal to outline 7785 // as a tail-call. Explicitly list the call instructions we know about so we 7786 // don't get unexpected results with call pseudo-instructions. 7787 auto UnknownCallOutlineType = outliner::InstrType::Illegal; 7788 if (MI.getOpcode() == AArch64::BLR || 7789 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL) 7790 UnknownCallOutlineType = outliner::InstrType::LegalTerminator; 7791 7792 if (!Callee) 7793 return UnknownCallOutlineType; 7794 7795 // We have a function we have information about. Check it if it's something 7796 // can safely outline. 7797 MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee); 7798 7799 // We don't know what's going on with the callee at all. Don't touch it. 7800 if (!CalleeMF) 7801 return UnknownCallOutlineType; 7802 7803 // Check if we know anything about the callee saves on the function. If we 7804 // don't, then don't touch it, since that implies that we haven't 7805 // computed anything about its stack frame yet. 7806 MachineFrameInfo &MFI = CalleeMF->getFrameInfo(); 7807 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 || 7808 MFI.getNumObjects() > 0) 7809 return UnknownCallOutlineType; 7810 7811 // At this point, we can say that CalleeMF ought to not pass anything on the 7812 // stack. Therefore, we can outline it. 7813 return outliner::InstrType::Legal; 7814 } 7815 7816 // Don't outline positions. 7817 if (MI.isPosition()) 7818 return outliner::InstrType::Illegal; 7819 7820 // Don't touch the link register or W30. 7821 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) || 7822 MI.modifiesRegister(AArch64::W30, &getRegisterInfo())) 7823 return outliner::InstrType::Illegal; 7824 7825 // Don't outline BTI instructions, because that will prevent the outlining 7826 // site from being indirectly callable. 7827 if (MI.getOpcode() == AArch64::HINT) { 7828 int64_t Imm = MI.getOperand(0).getImm(); 7829 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38) 7830 return outliner::InstrType::Illegal; 7831 } 7832 7833 return outliner::InstrType::Legal; 7834 } 7835 7836 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const { 7837 for (MachineInstr &MI : MBB) { 7838 const MachineOperand *Base; 7839 unsigned Width; 7840 int64_t Offset; 7841 bool OffsetIsScalable; 7842 7843 // Is this a load or store with an immediate offset with SP as the base? 
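    // For example (a sketch): an outlined "ldr x0, [sp, #8]" has to become
    // "ldr x0, [sp, #24]", because the outlined frame pushes the saved LR
    // (16 bytes) below the original stack pointer; the code below rewrites
    // the scaled immediate accordingly.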
7844 if (!MI.mayLoadOrStore() || 7845 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width, 7846 &RI) || 7847 (Base->isReg() && Base->getReg() != AArch64::SP)) 7848 continue; 7849 7850 // It is, so we have to fix it up. 7851 TypeSize Scale(0U, false); 7852 int64_t Dummy1, Dummy2; 7853 7854 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI); 7855 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!"); 7856 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2); 7857 assert(Scale != 0 && "Unexpected opcode!"); 7858 assert(!OffsetIsScalable && "Expected offset to be a byte offset"); 7859 7860 // We've pushed the return address to the stack, so add 16 to the offset. 7861 // This is safe, since we already checked if it would overflow when we 7862 // checked if this instruction was legal to outline. 7863 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue(); 7864 StackOffsetOperand.setImm(NewImm); 7865 } 7866 } 7867 7868 static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, 7869 bool ShouldSignReturnAddr, 7870 bool ShouldSignReturnAddrWithBKey) { 7871 if (ShouldSignReturnAddr) { 7872 MachineBasicBlock::iterator MBBPAC = MBB.begin(); 7873 MachineBasicBlock::iterator MBBAUT = MBB.getFirstTerminator(); 7874 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); 7875 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 7876 DebugLoc DL; 7877 7878 if (MBBAUT != MBB.end()) 7879 DL = MBBAUT->getDebugLoc(); 7880 7881 // At the very beginning of the basic block we insert the following 7882 // depending on the key type 7883 // 7884 // a_key: b_key: 7885 // PACIASP EMITBKEY 7886 // CFI_INSTRUCTION PACIBSP 7887 // CFI_INSTRUCTION 7888 if (ShouldSignReturnAddrWithBKey) { 7889 BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::EMITBKEY)) 7890 .setMIFlag(MachineInstr::FrameSetup); 7891 } 7892 7893 BuildMI(MBB, MBBPAC, DebugLoc(), 7894 TII->get(ShouldSignReturnAddrWithBKey ? AArch64::PACIBSP 7895 : AArch64::PACIASP)) 7896 .setMIFlag(MachineInstr::FrameSetup); 7897 7898 if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) { 7899 unsigned CFIIndex = 7900 MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); 7901 BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION)) 7902 .addCFIIndex(CFIIndex) 7903 .setMIFlags(MachineInstr::FrameSetup); 7904 } 7905 7906 // If v8.3a features are available we can replace a RET instruction by 7907 // RETAA or RETAB and omit the AUT instructions. In this case the 7908 // DW_CFA_AARCH64_negate_ra_state can't be emitted. 7909 if (Subtarget.hasPAuth() && MBBAUT != MBB.end() && 7910 MBBAUT->getOpcode() == AArch64::RET) { 7911 BuildMI(MBB, MBBAUT, DL, 7912 TII->get(ShouldSignReturnAddrWithBKey ? AArch64::RETAB 7913 : AArch64::RETAA)) 7914 .copyImplicitOps(*MBBAUT); 7915 MBB.erase(MBBAUT); 7916 } else { 7917 BuildMI(MBB, MBBAUT, DL, 7918 TII->get(ShouldSignReturnAddrWithBKey ? 
AArch64::AUTIBSP 7919 : AArch64::AUTIASP)) 7920 .setMIFlag(MachineInstr::FrameDestroy); 7921 unsigned CFIIndexAuth = 7922 MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); 7923 BuildMI(MBB, MBBAUT, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) 7924 .addCFIIndex(CFIIndexAuth) 7925 .setMIFlags(MachineInstr::FrameDestroy); 7926 } 7927 } 7928 } 7929 7930 void AArch64InstrInfo::buildOutlinedFrame( 7931 MachineBasicBlock &MBB, MachineFunction &MF, 7932 const outliner::OutlinedFunction &OF) const { 7933 7934 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>(); 7935 7936 if (OF.FrameConstructionID == MachineOutlinerTailCall) 7937 FI->setOutliningStyle("Tail Call"); 7938 else if (OF.FrameConstructionID == MachineOutlinerThunk) { 7939 // For thunk outlining, rewrite the last instruction from a call to a 7940 // tail-call. 7941 MachineInstr *Call = &*--MBB.instr_end(); 7942 unsigned TailOpcode; 7943 if (Call->getOpcode() == AArch64::BL) { 7944 TailOpcode = AArch64::TCRETURNdi; 7945 } else { 7946 assert(Call->getOpcode() == AArch64::BLR || 7947 Call->getOpcode() == AArch64::BLRNoIP); 7948 TailOpcode = AArch64::TCRETURNriALL; 7949 } 7950 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode)) 7951 .add(Call->getOperand(0)) 7952 .addImm(0); 7953 MBB.insert(MBB.end(), TC); 7954 Call->eraseFromParent(); 7955 7956 FI->setOutliningStyle("Thunk"); 7957 } 7958 7959 bool IsLeafFunction = true; 7960 7961 // Is there a call in the outlined range? 7962 auto IsNonTailCall = [](const MachineInstr &MI) { 7963 return MI.isCall() && !MI.isReturn(); 7964 }; 7965 7966 if (llvm::any_of(MBB.instrs(), IsNonTailCall)) { 7967 // Fix up the instructions in the range, since we're going to modify the 7968 // stack. 7969 7970 // Bugzilla ID: 46767 7971 // TODO: Check if fixing up twice is safe so we can outline these. 7972 assert(OF.FrameConstructionID != MachineOutlinerDefault && 7973 "Can only fix up stack references once"); 7974 fixupPostOutline(MBB); 7975 7976 IsLeafFunction = false; 7977 7978 // LR has to be a live in so that we can save it. 7979 if (!MBB.isLiveIn(AArch64::LR)) 7980 MBB.addLiveIn(AArch64::LR); 7981 7982 MachineBasicBlock::iterator It = MBB.begin(); 7983 MachineBasicBlock::iterator Et = MBB.end(); 7984 7985 if (OF.FrameConstructionID == MachineOutlinerTailCall || 7986 OF.FrameConstructionID == MachineOutlinerThunk) 7987 Et = std::prev(MBB.end()); 7988 7989 // Insert a save before the outlined region 7990 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) 7991 .addReg(AArch64::SP, RegState::Define) 7992 .addReg(AArch64::LR) 7993 .addReg(AArch64::SP) 7994 .addImm(-16); 7995 It = MBB.insert(It, STRXpre); 7996 7997 if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) { 7998 const TargetSubtargetInfo &STI = MF.getSubtarget(); 7999 const MCRegisterInfo *MRI = STI.getRegisterInfo(); 8000 unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true); 8001 8002 // Add a CFI saying the stack was moved 16 B down. 8003 int64_t StackPosEntry = 8004 MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16)); 8005 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) 8006 .addCFIIndex(StackPosEntry) 8007 .setMIFlags(MachineInstr::FrameSetup); 8008 8009 // Add a CFI saying that the LR that we want to find is now 16 B higher 8010 // than before. 
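      // Together with the entry above, this roughly corresponds to emitting
      //   .cfi_def_cfa_offset 16
      //   .cfi_offset w30, -16
      // in the outlined function's prologue.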
8011 int64_t LRPosEntry = MF.addFrameInst( 8012 MCCFIInstruction::createOffset(nullptr, DwarfReg, -16)); 8013 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) 8014 .addCFIIndex(LRPosEntry) 8015 .setMIFlags(MachineInstr::FrameSetup); 8016 } 8017 8018 // Insert a restore before the terminator for the function. 8019 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) 8020 .addReg(AArch64::SP, RegState::Define) 8021 .addReg(AArch64::LR, RegState::Define) 8022 .addReg(AArch64::SP) 8023 .addImm(16); 8024 Et = MBB.insert(Et, LDRXpost); 8025 } 8026 8027 // If a bunch of candidates reach this point they must agree on their return 8028 // address signing. It is therefore enough to just consider the signing 8029 // behaviour of one of them 8030 const auto &MFI = *OF.Candidates.front().getMF()->getInfo<AArch64FunctionInfo>(); 8031 bool ShouldSignReturnAddr = MFI.shouldSignReturnAddress(!IsLeafFunction); 8032 8033 // a_key is the default 8034 bool ShouldSignReturnAddrWithBKey = MFI.shouldSignWithBKey(); 8035 8036 // If this is a tail call outlined function, then there's already a return. 8037 if (OF.FrameConstructionID == MachineOutlinerTailCall || 8038 OF.FrameConstructionID == MachineOutlinerThunk) { 8039 signOutlinedFunction(MF, MBB, ShouldSignReturnAddr, 8040 ShouldSignReturnAddrWithBKey); 8041 return; 8042 } 8043 8044 // It's not a tail call, so we have to insert the return ourselves. 8045 8046 // LR has to be a live in so that we can return to it. 8047 if (!MBB.isLiveIn(AArch64::LR)) 8048 MBB.addLiveIn(AArch64::LR); 8049 8050 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET)) 8051 .addReg(AArch64::LR); 8052 MBB.insert(MBB.end(), ret); 8053 8054 signOutlinedFunction(MF, MBB, ShouldSignReturnAddr, 8055 ShouldSignReturnAddrWithBKey); 8056 8057 FI->setOutliningStyle("Function"); 8058 8059 // Did we have to modify the stack by saving the link register? 8060 if (OF.FrameConstructionID != MachineOutlinerDefault) 8061 return; 8062 8063 // We modified the stack. 8064 // Walk over the basic block and fix up all the stack accesses. 8065 fixupPostOutline(MBB); 8066 } 8067 8068 MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall( 8069 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, 8070 MachineFunction &MF, outliner::Candidate &C) const { 8071 8072 // Are we tail calling? 8073 if (C.CallConstructionID == MachineOutlinerTailCall) { 8074 // If yes, then we can just branch to the label. 8075 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi)) 8076 .addGlobalAddress(M.getNamedValue(MF.getName())) 8077 .addImm(0)); 8078 return It; 8079 } 8080 8081 // Are we saving the link register? 8082 if (C.CallConstructionID == MachineOutlinerNoLRSave || 8083 C.CallConstructionID == MachineOutlinerThunk) { 8084 // No, so just insert the call. 8085 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL)) 8086 .addGlobalAddress(M.getNamedValue(MF.getName()))); 8087 return It; 8088 } 8089 8090 // We want to return the spot where we inserted the call. 8091 MachineBasicBlock::iterator CallPt; 8092 8093 // Instructions for saving and restoring LR around the call instruction we're 8094 // going to insert. 8095 MachineInstr *Save; 8096 MachineInstr *Restore; 8097 // Can we save to a register? 8098 if (C.CallConstructionID == MachineOutlinerRegSave) { 8099 // FIXME: This logic should be sunk into a target-specific interface so that 8100 // we don't have to recompute the register. 
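    // The save/restore built below are plain register moves (sketch, with
    // x20 standing in for whichever register is found):
    //   mov x20, x30        ; ORRXrs x20, xzr, x30, #0
    //   bl  OUTLINED_FUNCTION
    //   mov x30, x20        ; ORRXrs x30, xzr, x20, #0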
    Register Reg = findRegisterToSaveLRTo(C);
    assert(Reg && "No callee-saved register available?");

    // LR has to be a live in so that we can save it.
    if (!MBB.isLiveIn(AArch64::LR))
      MBB.addLiveIn(AArch64::LR);

    // Save and restore LR from Reg.
    Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
               .addReg(AArch64::XZR)
               .addReg(AArch64::LR)
               .addImm(0);
    Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
                  .addReg(AArch64::XZR)
                  .addReg(Reg)
                  .addImm(0);
  } else {
    // We have the default case. Save and restore from SP.
    Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
               .addReg(AArch64::SP, RegState::Define)
               .addReg(AArch64::LR)
               .addReg(AArch64::SP)
               .addImm(-16);
    Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
                  .addReg(AArch64::SP, RegState::Define)
                  .addReg(AArch64::LR, RegState::Define)
                  .addReg(AArch64::SP)
                  .addImm(16);
  }

  It = MBB.insert(It, Save);
  It++;

  // Insert the call.
  It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
                          .addGlobalAddress(M.getNamedValue(MF.getName())));
  CallPt = It;
  It++;

  It = MBB.insert(It, Restore);
  return CallPt;
}

bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
    MachineFunction &MF) const {
  return MF.getFunction().hasMinSize();
}

std::optional<DestSourcePair>
AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {

  // AArch64::ORRWrs and AArch64::ORRXrs with a WZR/XZR source register and a
  // zero shift immediate are used as aliases for the mov instruction.
  if (MI.getOpcode() == AArch64::ORRWrs &&
      MI.getOperand(1).getReg() == AArch64::WZR &&
      MI.getOperand(3).getImm() == 0x0) {
    return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
  }

  if (MI.getOpcode() == AArch64::ORRXrs &&
      MI.getOperand(1).getReg() == AArch64::XZR &&
      MI.getOperand(3).getImm() == 0x0) {
    return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
  }

  return std::nullopt;
}

std::optional<RegImmPair>
AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
  int Sign = 1;
  int64_t Offset = 0;

  // TODO: Handle cases where Reg is a super- or sub-register of the
  // destination register.
  const MachineOperand &Op0 = MI.getOperand(0);
  if (!Op0.isReg() || Reg != Op0.getReg())
    return std::nullopt;

  switch (MI.getOpcode()) {
  default:
    return std::nullopt;
  case AArch64::SUBWri:
  case AArch64::SUBXri:
  case AArch64::SUBSWri:
  case AArch64::SUBSXri:
    Sign *= -1;
    [[fallthrough]];
  case AArch64::ADDSWri:
  case AArch64::ADDSXri:
  case AArch64::ADDWri:
  case AArch64::ADDXri: {
    // TODO: Third operand can be a global address (usually some string).
    if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
        !MI.getOperand(2).isImm())
      return std::nullopt;
    int Shift = MI.getOperand(3).getImm();
    assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
    Offset = Sign * (MI.getOperand(2).getImm() << Shift);
  }
  }
  return RegImmPair{MI.getOperand(1).getReg(), Offset};
}

/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
/// the destination register, then, if possible, describe the value in terms
/// of the source register.
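/// For example, if \p MI is "mov w0, w1" (ORRWrs) and x0 is asked about, the
/// value can be described in terms of w1, since the 32-bit move implicitly
/// zero-extends into the full 64-bit register.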
static std::optional<ParamLoadedValue>
describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
                       const TargetInstrInfo *TII,
                       const TargetRegisterInfo *TRI) {
  auto DestSrc = TII->isCopyInstr(MI);
  if (!DestSrc)
    return std::nullopt;

  Register DestReg = DestSrc->Destination->getReg();
  Register SrcReg = DestSrc->Source->getReg();

  auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});

  // If the described register is the destination, just return the source.
  if (DestReg == DescribedReg)
    return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);

  // ORRWrs zero-extends to 64 bits, so we need to consider such cases.
  if (MI.getOpcode() == AArch64::ORRWrs &&
      TRI->isSuperRegister(DestReg, DescribedReg))
    return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);

  // We may need to describe the lower part of an ORRXrs move.
  if (MI.getOpcode() == AArch64::ORRXrs &&
      TRI->isSubRegister(DestReg, DescribedReg)) {
    Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
    return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
  }

  assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
         "Unhandled ORR[XW]rs copy case");

  return std::nullopt;
}

std::optional<ParamLoadedValue>
AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
                                      Register Reg) const {
  const MachineFunction *MF = MI.getMF();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  switch (MI.getOpcode()) {
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: {
    // MOVZWi may be used for producing zero-extended 32-bit immediates in
    // 64-bit parameters, so we need to consider super-registers.
    if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
      return std::nullopt;

    if (!MI.getOperand(1).isImm())
      return std::nullopt;
    int64_t Immediate = MI.getOperand(1).getImm();
    int Shift = MI.getOperand(2).getImm();
    return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
                            nullptr);
  }
  case AArch64::ORRWrs:
  case AArch64::ORRXrs:
    return describeORRLoadedValue(MI, Reg, this, TRI);
  }

  return TargetInstrInfo::describeLoadedValue(MI, Reg);
}

bool AArch64InstrInfo::isExtendLikelyToBeFolded(
    MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
  assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
         ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
         ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);

  // Anyexts are nops.
  if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
    return true;

  Register DefReg = ExtMI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(DefReg))
    return false;

  // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
  // addressing mode.
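  // For example, a G_PTR_ADD whose offset is a G_SEXT of a 32-bit index and
  // which feeds a load can usually be selected as a single
  // "ldr x0, [xBase, wIdx, sxtw]", so treating the extend as free here is
  // normally a safe bet.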
  auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
  return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
}

uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::ElementSizeMask;
}

bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
}

bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
}

unsigned int
AArch64InstrInfo::getTailDuplicateSize(CodeGenOpt::Level OptLevel) const {
  return OptLevel >= CodeGenOpt::Aggressive ? 6 : 2;
}

unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
  if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
    return AArch64::BLRNoIP;
  else
    return AArch64::BLR;
}

#define GET_INSTRINFO_HELPERS
#define GET_INSTRMAP_INFO
#include "AArch64GenInstrInfo.inc"