//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>

using namespace llvm;

#define GET_INSTRINFO_CTOR_DTOR
#include "AArch64GenInstrInfo.inc"

static cl::opt<unsigned> TBZDisplacementBits(
    "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
    cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned> CBZDisplacementBits(
    "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
    cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned>
    BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
                        cl::desc("Restrict range of Bcc instructions (DEBUG)"));

AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
    : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
                          AArch64::CATCHRET),
      RI(STI.getTargetTriple()), Subtarget(STI) {}

/// GetInstSize - Return the number of bytes of code the specified
/// instruction may be. This returns the maximum number of bytes.
unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  const MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction *MF = MBB.getParent();
  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();

  {
    auto Op = MI.getOpcode();
    if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
      return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
  }

  // Meta-instructions emit no code.
  if (MI.isMetaInstruction())
    return 0;

  // FIXME: We currently only handle pseudoinstructions that don't get expanded
  // before the assembly printer.
  unsigned NumBytes = 0;
  const MCInstrDesc &Desc = MI.getDesc();
  switch (Desc.getOpcode()) {
  default:
    // Anything not explicitly designated otherwise is a normal 4-byte insn.
    NumBytes = 4;
    break;
  case TargetOpcode::STACKMAP:
    // The upper bound for a stackmap intrinsic is the full length of its shadow.
    NumBytes = StackMapOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::PATCHPOINT:
    // The size of the patchpoint intrinsic is the number of bytes requested.
    NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::STATEPOINT:
    NumBytes = StatepointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    // No patch bytes means a normal call inst is emitted.
    if (NumBytes == 0)
      NumBytes = 4;
    break;
  case AArch64::TLSDESC_CALLSEQ:
    // This gets lowered to an instruction sequence which takes 16 bytes.
    NumBytes = 16;
    break;
  case AArch64::SpeculationBarrierISBDSBEndBB:
    // This gets lowered to 2 4-byte instructions.
    NumBytes = 8;
    break;
  case AArch64::SpeculationBarrierSBEndBB:
    // This gets lowered to a single 4-byte instruction.
    NumBytes = 4;
    break;
  case AArch64::JumpTableDest32:
  case AArch64::JumpTableDest16:
  case AArch64::JumpTableDest8:
    NumBytes = 12;
    break;
  case AArch64::SPACE:
    NumBytes = MI.getOperand(1).getImm();
    break;
  case AArch64::StoreSwiftAsyncContext:
    NumBytes = 20;
    break;
  case TargetOpcode::BUNDLE:
    NumBytes = getInstBundleLength(MI);
    break;
  }

  return NumBytes;
}

unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
  unsigned Size = 0;
  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
  while (++I != E && I->isInsideBundle()) {
    assert(!I->isBundle() && "No nested bundle!");
    Size += getInstSizeInBytes(*I);
  }
  return Size;
}

static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
                            SmallVectorImpl<MachineOperand> &Cond) {
  // Block ends with fall-through condbranch.
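  // The Cond encoding built below is what reverseBranchCondition(),
  // instantiateCondBranch() and insertSelect() expect: for Bcc, Cond holds
  // only the condition code; for the folded compare-and-branch forms, Cond[0]
  // is a -1 marker, Cond[1] the branch opcode, Cond[2] the tested register
  // operand and, for TB[N]Z only, Cond[3] the bit number. For example,
  // "tbnz w0, #3, %bb.1" is recorded as {-1, TBNZW, %w0, 3}.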
  switch (LastInst->getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    Target = LastInst->getOperand(2).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    Cond.push_back(LastInst->getOperand(1));
  }
}

static unsigned getBranchDisplacementBits(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return 64;
  case AArch64::TBNZW:
  case AArch64::TBZW:
  case AArch64::TBNZX:
  case AArch64::TBZX:
    return TBZDisplacementBits;
  case AArch64::CBNZW:
  case AArch64::CBZW:
  case AArch64::CBNZX:
  case AArch64::CBZX:
    return CBZDisplacementBits;
  case AArch64::Bcc:
    return BCCDisplacementBits;
  }
}

bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                             int64_t BrOffset) const {
  unsigned Bits = getBranchDisplacementBits(BranchOp);
  assert(Bits >= 3 && "max branch displacement must be enough to jump "
                      "over conditional branch expansion");
  return isIntN(Bits, BrOffset / 4);
}

MachineBasicBlock *
AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return MI.getOperand(0).getMBB();
  case AArch64::TBZW:
  case AArch64::TBNZW:
  case AArch64::TBZX:
  case AArch64::TBNZX:
    return MI.getOperand(2).getMBB();
  case AArch64::CBZW:
  case AArch64::CBNZW:
  case AArch64::CBZX:
  case AArch64::CBNZX:
  case AArch64::Bcc:
    return MI.getOperand(1).getMBB();
  }
}

// Branch analysis.
bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *&TBB,
                                     MachineBasicBlock *&FBB,
                                     SmallVectorImpl<MachineOperand> &Cond,
                                     bool AllowModify) const {
  // If the block has no terminators, it just falls into the block after it.
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return false;

  // Skip over SpeculationBarrierEndBB terminators.
  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
    --I;
  }

  if (!isUnpredicatedTerminator(*I))
    return false;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;

  // If there is only one terminator instruction, process it.
  unsigned LastOpc = LastInst->getOpcode();
  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
    if (isUncondBranchOpcode(LastOpc)) {
      TBB = LastInst->getOperand(0).getMBB();
      return false;
    }
    if (isCondBranchOpcode(LastOpc)) {
      // Block ends with fall-through condbranch.
      parseCondBranch(LastInst, TBB, Cond);
      return false;
    }
    return true; // Can't handle indirect branch.
  }

  // Get the instruction before it if it is a terminator.
  MachineInstr *SecondLastInst = &*I;
  unsigned SecondLastOpc = SecondLastInst->getOpcode();

  // If AllowModify is true and the block ends with two or more unconditional
  // branches, delete all but the first unconditional branch.
  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
    while (isUncondBranchOpcode(SecondLastOpc)) {
      LastInst->eraseFromParent();
      LastInst = SecondLastInst;
      LastOpc = LastInst->getOpcode();
      if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
        // Return now if the only terminator is an unconditional branch.
        TBB = LastInst->getOperand(0).getMBB();
        return false;
      } else {
        SecondLastInst = &*I;
        SecondLastOpc = SecondLastInst->getOpcode();
      }
    }
  }

  // If we're allowed to modify and the block ends in an unconditional branch
  // which could simply fall through, remove the branch. (Note: This case only
  // matters when we can't understand the whole sequence, otherwise it's also
  // handled by BranchFolding.cpp.)
  if (AllowModify && isUncondBranchOpcode(LastOpc) &&
      MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
    LastInst->eraseFromParent();
    LastInst = SecondLastInst;
    LastOpc = LastInst->getOpcode();
    if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
      assert(!isUncondBranchOpcode(LastOpc) &&
             "unreachable unconditional branches removed above");

      if (isCondBranchOpcode(LastOpc)) {
        // Block ends with fall-through condbranch.
        parseCondBranch(LastInst, TBB, Cond);
        return false;
      }
      return true; // Can't handle indirect branch.
    } else {
      SecondLastInst = &*I;
      SecondLastOpc = SecondLastInst->getOpcode();
    }
  }

  // If there are three terminators, we don't know what sort of block this is.
  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
    return true;

  // If the block ends with a B and a Bcc, handle it.
  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    parseCondBranch(SecondLastInst, TBB, Cond);
    FBB = LastInst->getOperand(0).getMBB();
    return false;
  }

  // If the block ends with two unconditional branches, handle it. The second
  // one is not executed, so remove it.
  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    TBB = SecondLastInst->getOperand(0).getMBB();
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return false;
  }

  // ...likewise if it ends with an indirect branch followed by an unconditional
  // branch.
  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return true;
  }

  // Otherwise, can't handle this.
  return true;
}

bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
                                              MachineBranchPredicate &MBP,
                                              bool AllowModify) const {
  // For the moment, handle only a block which ends with a cb(n)zx followed by
  // a fallthrough. Why this? Because it is a common form.
  // TODO: Should we handle b.cc?
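  //
  // For example, a block ending in
  //   cbnz x0, %bb.2
  // followed by a fallthrough into %bb.1 is reported as LHS = x0, RHS = #0,
  // Predicate = PRED_NE, TrueDest = %bb.2 and FalseDest = the fallthrough
  // block.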

  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return true;

  // Skip over SpeculationBarrierEndBB terminators.
  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
    --I;
  }

  if (!isUnpredicatedTerminator(*I))
    return true;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;
  unsigned LastOpc = LastInst->getOpcode();
  if (!isCondBranchOpcode(LastOpc))
    return true;

  switch (LastOpc) {
  default:
    return true;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    break;
  };

  MBP.TrueDest = LastInst->getOperand(1).getMBB();
  assert(MBP.TrueDest && "expected!");
  MBP.FalseDest = MBB.getNextNode();

  MBP.ConditionDef = nullptr;
  MBP.SingleUseCondition = false;

  MBP.LHS = LastInst->getOperand(0);
  MBP.RHS = MachineOperand::CreateImm(0);
  MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
                                            : MachineBranchPredicate::PRED_EQ;
  return false;
}

bool AArch64InstrInfo::reverseBranchCondition(
    SmallVectorImpl<MachineOperand> &Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
    Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
  } else {
    // Folded compare-and-branch
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown conditional branch!");
    case AArch64::CBZW:
      Cond[1].setImm(AArch64::CBNZW);
      break;
    case AArch64::CBNZW:
      Cond[1].setImm(AArch64::CBZW);
      break;
    case AArch64::CBZX:
      Cond[1].setImm(AArch64::CBNZX);
      break;
    case AArch64::CBNZX:
      Cond[1].setImm(AArch64::CBZX);
      break;
    case AArch64::TBZW:
      Cond[1].setImm(AArch64::TBNZW);
      break;
    case AArch64::TBNZW:
      Cond[1].setImm(AArch64::TBZW);
      break;
    case AArch64::TBZX:
      Cond[1].setImm(AArch64::TBNZX);
      break;
    case AArch64::TBNZX:
      Cond[1].setImm(AArch64::TBZX);
      break;
    }
  }

  return false;
}

unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
                                        int *BytesRemoved) const {
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return 0;

  if (!isUncondBranchOpcode(I->getOpcode()) &&
      !isCondBranchOpcode(I->getOpcode()))
    return 0;

  // Remove the branch.
  I->eraseFromParent();

  I = MBB.end();

  if (I == MBB.begin()) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }
  --I;
  if (!isCondBranchOpcode(I->getOpcode())) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }

  // Remove the branch.
  I->eraseFromParent();
  if (BytesRemoved)
    *BytesRemoved = 8;

  return 2;
}

void AArch64InstrInfo::instantiateCondBranch(
    MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
    ArrayRef<MachineOperand> Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
  } else {
    // Folded compare-and-branch
    // Note that we use addOperand instead of addReg to keep the flags.
    const MachineInstrBuilder MIB =
        BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
    if (Cond.size() > 3)
      MIB.addImm(Cond[3].getImm());
    MIB.addMBB(TBB);
  }
}

unsigned AArch64InstrInfo::insertBranch(
    MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
    ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
  // Shouldn't be a fall through.
  assert(TBB && "insertBranch must not be told to insert a fallthrough");

  if (!FBB) {
    if (Cond.empty()) // Unconditional branch?
      BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
    else
      instantiateCondBranch(MBB, DL, TBB, Cond);

    if (BytesAdded)
      *BytesAdded = 4;

    return 1;
  }

  // Two-way conditional branch.
  instantiateCondBranch(MBB, DL, TBB, Cond);
  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);

  if (BytesAdded)
    *BytesAdded = 8;

  return 2;
}

// Find the original register that VReg is copied from.
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
  while (Register::isVirtualRegister(VReg)) {
    const MachineInstr *DefMI = MRI.getVRegDef(VReg);
    if (!DefMI->isFullCopy())
      return VReg;
    VReg = DefMI->getOperand(1).getReg();
  }
  return VReg;
}

// Determine if VReg is defined by an instruction that can be folded into a
// csel instruction. If so, return the folded opcode, and the replacement
// register.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
                                unsigned *NewVReg = nullptr) {
  VReg = removeCopies(MRI, VReg);
  if (!Register::isVirtualRegister(VReg))
    return 0;

  bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
  unsigned Opc = 0;
  unsigned SrcOpNum = 0;
  switch (DefMI->getOpcode()) {
  case AArch64::ADDSXri:
  case AArch64::ADDSWri:
    // if NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
      return 0;
    // fall-through to ADDXri and ADDWri.
    LLVM_FALLTHROUGH;
  case AArch64::ADDXri:
  case AArch64::ADDWri:
    // add x, 1 -> csinc.
    if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
        DefMI->getOperand(3).getImm() != 0)
      return 0;
    SrcOpNum = 1;
    Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
    break;

  case AArch64::ORNXrr:
  case AArch64::ORNWrr: {
    // not x -> csinv, represented as orn dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcOpNum = 2;
    Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
    break;
  }

  case AArch64::SUBSXrr:
  case AArch64::SUBSWrr:
    // if NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
      return 0;
    // fall-through to SUBXrr and SUBWrr.
    LLVM_FALLTHROUGH;
  case AArch64::SUBXrr:
  case AArch64::SUBWrr: {
    // neg x -> csneg, represented as sub dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcOpNum = 2;
    Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
    break;
  }
  default:
    return 0;
  }
  assert(Opc && SrcOpNum && "Missing parameters");

  if (NewVReg)
    *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
  return Opc;
}

bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
                                       ArrayRef<MachineOperand> Cond,
                                       Register DstReg, Register TrueReg,
                                       Register FalseReg, int &CondCycles,
                                       int &TrueCycles,
                                       int &FalseCycles) const {
  // Check register classes.
  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *RC =
      RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
  if (!RC)
    return false;

  // Also need to check the dest regclass, in case we're trying to optimize
  // something like:
  // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
  if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
    return false;

  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
  unsigned ExtraCondLat = Cond.size() != 1;

  // GPRs are handled by csel.
  // FIXME: Fold in x+1, -x, and ~x when applicable.
  if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
      AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
    // Single-cycle csel, csinc, csinv, and csneg.
    CondCycles = 1 + ExtraCondLat;
    TrueCycles = FalseCycles = 1;
    if (canFoldIntoCSel(MRI, TrueReg))
      TrueCycles = 0;
    else if (canFoldIntoCSel(MRI, FalseReg))
      FalseCycles = 0;
    return true;
  }

  // Scalar floating point is handled by fcsel.
  // FIXME: Form fabs, fmin, and fmax when applicable.
  if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
      AArch64::FPR32RegClass.hasSubClassEq(RC)) {
    CondCycles = 5 + ExtraCondLat;
    TrueCycles = FalseCycles = 2;
    return true;
  }

  // Can't do vectors.
  return false;
}

void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const DebugLoc &DL, Register DstReg,
                                    ArrayRef<MachineOperand> Cond,
                                    Register TrueReg, Register FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Parse the condition code, see parseCondBranch() above.
  AArch64CC::CondCode CC;
  switch (Cond.size()) {
  default:
    llvm_unreachable("Unknown condition opcode in Cond");
  case 1: // b.cc
    CC = AArch64CC::CondCode(Cond[0].getImm());
    break;
  case 3: { // cbz/cbnz
    // We must insert a compare against 0.
    bool Is64Bit;
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::CBZW:
      Is64Bit = false;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBZX:
      Is64Bit = true;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBNZW:
      Is64Bit = false;
      CC = AArch64CC::NE;
      break;
    case AArch64::CBNZX:
      Is64Bit = true;
      CC = AArch64CC::NE;
      break;
    }
    Register SrcReg = Cond[2].getReg();
    if (Is64Bit) {
      // cmp reg, #0 is actually subs xzr, reg, #0.
      MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    } else {
      MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    }
    break;
  }
  case 4: { // tbz/tbnz
    // We must insert a tst instruction.
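    // For example, Cond = {-1, TBNZW, %w0, 5} becomes
    //   ANDSWri wzr, %w0, #(1 << 5)
    // and the select inserted below then uses CC = NE.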
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::TBZW:
    case AArch64::TBZX:
      CC = AArch64CC::EQ;
      break;
    case AArch64::TBNZW:
    case AArch64::TBNZX:
      CC = AArch64CC::NE;
      break;
    }
    // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
      BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
    else
      BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
    break;
  }
  }

  unsigned Opc = 0;
  const TargetRegisterClass *RC = nullptr;
  bool TryFold = false;
  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
    RC = &AArch64::GPR64RegClass;
    Opc = AArch64::CSELXr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
    RC = &AArch64::GPR32RegClass;
    Opc = AArch64::CSELWr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FCSELDrrr;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
    RC = &AArch64::FPR32RegClass;
    Opc = AArch64::FCSELSrrr;
  }
  assert(RC && "Unsupported regclass");

  // Try folding simple instructions into the csel.
  if (TryFold) {
    unsigned NewVReg = 0;
    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
    if (FoldedOpc) {
      // The folded opcodes csinc, csinv and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
      CC = AArch64CC::getInvertedCondCode(CC);
      TrueReg = FalseReg;
    } else
      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);

    // Fold the operation. Leave any dead instructions for DCE to clean up.
    if (FoldedOpc) {
      FalseReg = NewVReg;
      Opc = FoldedOpc;
      // This extends the live range of NewVReg.
      MRI.clearKillFlags(NewVReg);
    }
  }

  // Pull all virtual registers into the appropriate class.
  MRI.constrainRegClass(TrueReg, RC);
  MRI.constrainRegClass(FalseReg, RC);

  // Insert the csel.
  BuildMI(MBB, I, DL, get(Opc), DstReg)
      .addReg(TrueReg)
      .addReg(FalseReg)
      .addImm(CC);
}

/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
  uint64_t Imm = MI.getOperand(1).getImm();
  uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
  uint64_t Encoding;
  return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
}

// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in the future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
  if (!Subtarget.hasCustomCheapAsMoveHandling())
    return MI.isAsCheapAsAMove();

  const unsigned Opcode = MI.getOpcode();

  // Firstly, check cases gated by features.

  if (Subtarget.hasZeroCycleZeroingFP()) {
    if (Opcode == AArch64::FMOVH0 ||
        Opcode == AArch64::FMOVS0 ||
        Opcode == AArch64::FMOVD0)
      return true;
  }

  if (Subtarget.hasZeroCycleZeroingGP()) {
    if (Opcode == TargetOpcode::COPY &&
        (MI.getOperand(1).getReg() == AArch64::WZR ||
         MI.getOperand(1).getReg() == AArch64::XZR))
      return true;
  }

  // Secondly, check cases specific to sub-targets.

  if (Subtarget.hasExynosCheapAsMoveHandling()) {
    if (isExynosCheapAsMove(MI))
      return true;

    return MI.isAsCheapAsAMove();
  }

  // Finally, check generic cases.

  switch (Opcode) {
  default:
    return false;

  // add/sub on register without shift
  case AArch64::ADDWri:
  case AArch64::ADDXri:
  case AArch64::SUBWri:
  case AArch64::SUBXri:
    return (MI.getOperand(3).getImm() == 0);

  // logical ops on immediate
  case AArch64::ANDWri:
  case AArch64::ANDXri:
  case AArch64::EORWri:
  case AArch64::EORXri:
  case AArch64::ORRWri:
  case AArch64::ORRXri:
    return true;

  // logical ops on register without shift
  case AArch64::ANDWrr:
  case AArch64::ANDXrr:
  case AArch64::BICWrr:
  case AArch64::BICXrr:
  case AArch64::EONWrr:
  case AArch64::EONXrr:
  case AArch64::EORWrr:
  case AArch64::EORXrr:
  case AArch64::ORNWrr:
  case AArch64::ORNXrr:
  case AArch64::ORRWrr:
  case AArch64::ORRXrr:
    return true;

  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
  // ORRXri, it is as cheap as MOV
  case AArch64::MOVi32imm:
    return canBeExpandedToORR(MI, 32);
  case AArch64::MOVi64imm:
    return canBeExpandedToORR(MI, 64);
  }

  llvm_unreachable("Unknown opcode to check as cheap as a move!");
}

bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::ADDSWrs:
  case AArch64::ADDSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    if (ShiftVal == 0)
      return true;
    return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
  }

  case AArch64::ADDWrx:
  case AArch64::ADDXrx:
  case AArch64::ADDXrx64:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrx:
  case AArch64::ADDSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) <= 4;
    }
  }

  case AArch64::SUBWrs:
  case AArch64::SUBSWrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
  }

  case AArch64::SUBXrs:
  case AArch64::SUBSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
  }

  case AArch64::SUBWrx:
  case AArch64::SUBXrx:
  case AArch64::SUBXrx64:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrx:
  case AArch64::SUBSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) == 0;
    }
  }

  case AArch64::LDRBBroW:
  case AArch64::LDRBBroX:
  case AArch64::LDRBroW:
  case AArch64::LDRBroX:
  case AArch64::LDRDroW:
  case AArch64::LDRDroX:
  case AArch64::LDRHHroW:
  case AArch64::LDRHHroX:
  case AArch64::LDRHroW:
  case AArch64::LDRHroX:
  case AArch64::LDRQroW:
  case AArch64::LDRQroX:
  case AArch64::LDRSBWroW:
  case AArch64::LDRSBWroX:
  case AArch64::LDRSBXroW:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSHWroW:
  case AArch64::LDRSHWroX:
  case AArch64::LDRSHXroW:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSWroW:
  case AArch64::LDRSWroX:
  case AArch64::LDRSroW:
  case AArch64::LDRSroX:
  case AArch64::LDRWroW:
  case AArch64::LDRWroX:
  case AArch64::LDRXroW:
  case AArch64::LDRXroX:
  case AArch64::PRFMroW:
  case AArch64::PRFMroX:
  case AArch64::STRBBroW:
  case AArch64::STRBBroX:
  case AArch64::STRBroW:
  case AArch64::STRBroX:
  case AArch64::STRDroW:
  case AArch64::STRDroX:
  case AArch64::STRHHroW:
  case AArch64::STRHHroX:
  case AArch64::STRHroW:
  case AArch64::STRHroX:
  case AArch64::STRQroW:
  case AArch64::STRQroX:
  case AArch64::STRSroW:
  case AArch64::STRSroX:
  case AArch64::STRWroW:
  case AArch64::STRWroX:
  case AArch64::STRXroW:
  case AArch64::STRXroX: {
    unsigned IsSigned = MI.getOperand(3).getImm();
    return !IsSigned;
  }
  }
}

bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  case AArch64::SEH_StackAlloc:
  case AArch64::SEH_SaveFPLR:
  case AArch64::SEH_SaveFPLR_X:
  case AArch64::SEH_SaveReg:
  case AArch64::SEH_SaveReg_X:
  case AArch64::SEH_SaveRegP:
  case AArch64::SEH_SaveRegP_X:
  case AArch64::SEH_SaveFReg:
  case AArch64::SEH_SaveFReg_X:
  case AArch64::SEH_SaveFRegP:
  case AArch64::SEH_SaveFRegP_X:
  case AArch64::SEH_SetFP:
  case AArch64::SEH_AddFP:
  case AArch64::SEH_Nop:
  case AArch64::SEH_PrologEnd:
  case AArch64::SEH_EpilogStart:
  case AArch64::SEH_EpilogEnd:
    return true;
  }
}

bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
                                             Register &SrcReg, Register &DstReg,
                                             unsigned &SubIdx) const {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::SBFMXri: // aka sxtw
  case AArch64::UBFMXri: // aka uxtw
    // Check for the 32 -> 64 bit extension case, these instructions can do
    // much more.
    if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
      return false;
    // This is a signed or unsigned 32 -> 64 bit extension.
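    // E.g. "sxtw x0, w1" is SBFMXri %x0, %x1, 0, 31; report that the
    // pre-extension 32-bit value lives in the sub_32 sub-register of the
    // destination so the coalescer can treat this like a copy.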
    SrcReg = MI.getOperand(1).getReg();
    DstReg = MI.getOperand(0).getReg();
    SubIdx = AArch64::sub_32;
    return true;
  }
}

bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
    const MachineInstr &MIa, const MachineInstr &MIb) const {
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
  int64_t OffsetA = 0, OffsetB = 0;
  unsigned WidthA = 0, WidthB = 0;
  bool OffsetAIsScalable = false, OffsetBIsScalable = false;

  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");

  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
      MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    return false;

  // Retrieve the base, offset from the base and width. Width
  // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
  // the bases are identical, and the offset of a lower memory access +
  // the width doesn't overlap the offset of a higher memory access,
  // then the memory accesses are different.
  // If OffsetAIsScalable and OffsetBIsScalable are both true, they
  // are assumed to have the same scale (vscale).
  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
                                   WidthA, TRI) &&
      getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
                                   WidthB, TRI)) {
    if (BaseOpA->isIdenticalTo(*BaseOpB) &&
        OffsetAIsScalable == OffsetBIsScalable) {
      int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
      int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
      int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
      if (LowOffset + LowWidth <= HighOffset)
        return true;
    }
  }
  return false;
}

bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                            const MachineBasicBlock *MBB,
                                            const MachineFunction &MF) const {
  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
    return true;
  switch (MI.getOpcode()) {
  case AArch64::HINT:
    // CSDB hints are scheduling barriers.
    if (MI.getOperand(0).getImm() == 0x14)
      return true;
    break;
  case AArch64::DSB:
  case AArch64::ISB:
    // DSB and ISB also are scheduling barriers.
    return true;
  default:;
  }
  return isSEHInstruction(MI);
}

/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
                                      Register &SrcReg2, int &CmpMask,
                                      int &CmpValue) const {
  // The first operand can be a frame index where we'd normally expect a
  // register.
  assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
  if (!MI.getOperand(1).isReg())
    return false;

  auto NormalizeCmpValue = [](int64_t Value) -> int {
    // Comparison immediates may be 64-bit, but CmpValue is only an int.
    // Normalize to 0/1/2 return value, where 2 indicates any value apart from
    // 0 or 1.
    // TODO: Switch CmpValue to int64_t in the API to avoid this.
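    // E.g. "subs w0, w1, #7" is normalized to CmpValue == 2, while compares
    // against #0 and #1 keep their exact value for the optimizations below.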
    if (Value == 0 || Value == 1)
      return Value;
    return 2;
  };

  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::PTEST_PP:
    SrcReg = MI.getOperand(0).getReg();
    SrcReg2 = MI.getOperand(1).getReg();
    // Not sure about the mask and value for now...
    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWrr:
  case AArch64::SUBSWrs:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXrs:
  case AArch64::SUBSXrx:
  case AArch64::ADDSWrr:
  case AArch64::ADDSWrs:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXrs:
  case AArch64::ADDSXrx:
    // Replace SUBSWrr with SUBWrr if NZCV is not used.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = MI.getOperand(2).getReg();
    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWri:
  case AArch64::ADDSWri:
  case AArch64::SUBSXri:
  case AArch64::ADDSXri:
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    CmpValue = NormalizeCmpValue(MI.getOperand(2).getImm());
    return true;
  case AArch64::ANDSWri:
  case AArch64::ANDSXri:
    // ANDS does not use the same encoding scheme as the other xxxS
    // instructions.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    CmpValue = NormalizeCmpValue(AArch64_AM::decodeLogicalImmediate(
        MI.getOperand(2).getImm(),
        MI.getOpcode() == AArch64::ANDSWri ? 32 : 64));
    return true;
  }

  return false;
}

static bool UpdateOperandRegClass(MachineInstr &Instr) {
  MachineBasicBlock *MBB = Instr.getParent();
  assert(MBB && "Can't get MachineBasicBlock here");
  MachineFunction *MF = MBB->getParent();
  assert(MF && "Can't get MachineFunction here");
  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  MachineRegisterInfo *MRI = &MF->getRegInfo();

  for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
       ++OpIdx) {
    MachineOperand &MO = Instr.getOperand(OpIdx);
    const TargetRegisterClass *OpRegCstraints =
        Instr.getRegClassConstraint(OpIdx, TII, TRI);

    // If there's no constraint, there's nothing to do.
    if (!OpRegCstraints)
      continue;
    // If the operand is a frame index, there's nothing to do here.
    // A frame index operand will resolve correctly during PEI.
    if (MO.isFI())
      continue;

    assert(MO.isReg() &&
           "Operand has register constraints without being a register!");

    Register Reg = MO.getReg();
    if (Register::isPhysicalRegister(Reg)) {
      if (!OpRegCstraints->contains(Reg))
        return false;
    } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
               !MRI->constrainRegClass(Reg, OpRegCstraints))
      return false;
  }

  return true;
}

/// Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible to do the actual
/// substitution and legality checking.
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
  // Don't convert all compare instructions, because for some the zero register
  // encoding becomes the sp register.
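  // E.g. "adds wzr, w1, #1" (an alias of "cmn w1, #1") must keep its ADDSWri
  // form: in the non-flag-setting ADDWri encoding, register number 31 in the
  // destination means WSP rather than WZR.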
  bool MIDefinesZeroReg = false;
  if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
    MIDefinesZeroReg = true;

  switch (MI.getOpcode()) {
  default:
    return MI.getOpcode();
  case AArch64::ADDSWrr:
    return AArch64::ADDWrr;
  case AArch64::ADDSWri:
    return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
  case AArch64::ADDSWrs:
    return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
  case AArch64::ADDSWrx:
    return AArch64::ADDWrx;
  case AArch64::ADDSXrr:
    return AArch64::ADDXrr;
  case AArch64::ADDSXri:
    return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
  case AArch64::ADDSXrs:
    return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
  case AArch64::ADDSXrx:
    return AArch64::ADDXrx;
  case AArch64::SUBSWrr:
    return AArch64::SUBWrr;
  case AArch64::SUBSWri:
    return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
  case AArch64::SUBSWrs:
    return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
  case AArch64::SUBSWrx:
    return AArch64::SUBWrx;
  case AArch64::SUBSXrr:
    return AArch64::SUBXrr;
  case AArch64::SUBSXri:
    return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
  case AArch64::SUBSXrs:
    return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
  case AArch64::SUBSXrx:
    return AArch64::SUBXrx;
  }
}

enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };

/// True when condition flags are accessed (either by writing or reading)
/// on the instruction trace starting at From and ending at To.
///
/// Note: If From and To are from different blocks it's assumed CC are accessed
/// on the path.
static bool areCFlagsAccessedBetweenInstrs(
    MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
    const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
  // Early exit if To is at the beginning of the BB.
  if (To == To->getParent()->begin())
    return true;

  // Check whether the instructions are in the same basic block.
  // If not, assume the condition flags might get modified somewhere.
  if (To->getParent() != From->getParent())
    return true;

  // From must be above To.
  assert(std::any_of(
      ++To.getReverse(), To->getParent()->rend(),
      [From](MachineInstr &MI) { return MI.getIterator() == From; }));

  // We iterate backward starting at \p To until we hit \p From.
  for (const MachineInstr &Instr :
       instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
    if (((AccessToCheck & AK_Write) &&
         Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
        ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
      return true;
  }
  return false;
}

/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
/// operation which could set the flags in an identical manner.
bool AArch64InstrInfo::optimizePTestInstr(
    MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
    const MachineRegisterInfo *MRI) const {
  auto *Mask = MRI->getUniqueVRegDef(MaskReg);
  auto *Pred = MRI->getUniqueVRegDef(PredReg);
  auto NewOp = Pred->getOpcode();
  bool OpChanged = false;

  unsigned MaskOpcode = Mask->getOpcode();
  unsigned PredOpcode = Pred->getOpcode();
  bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
  bool PredIsWhileLike = isWhileOpcode(PredOpcode);

  if (isPTrueOpcode(MaskOpcode) && (PredIsPTestLike || PredIsWhileLike)) {
    // For PTEST(PTRUE, OTHER_INST), PTEST is redundant when PTRUE doesn't
    // deactivate any lanes OTHER_INST might set.
    uint64_t MaskElementSize = getElementSizeForOpcode(MaskOpcode);
    uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);

    // Must be an all active predicate of matching element size.
    if ((PredElementSize != MaskElementSize) ||
        (Mask->getOperand(1).getImm() != 31))
      return false;

    // Fallthrough to simply remove the PTEST.
  } else if ((Mask == Pred) && (PredIsPTestLike || PredIsWhileLike)) {
    // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
    // instruction that sets the flags as PTEST would.

    // Fallthrough to simply remove the PTEST.
  } else if (PredIsPTestLike) {
    // For PTEST(PG_1, PTEST_LIKE(PG2, ...)), PTEST is redundant when both
    // instructions use the same predicate.
    auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
    if (Mask != PTestLikeMask)
      return false;

    // Fallthrough to simply remove the PTEST.
  } else {
    switch (Pred->getOpcode()) {
    case AArch64::BRKB_PPzP:
    case AArch64::BRKPB_PPzPP: {
      // Op 0 is chain, 1 is the mask, 2 the previous predicate to
      // propagate, 3 the new predicate.

      // Check to see if our mask is the same as the brkpb's. If
      // not the resulting flag bits may be different and we
      // can't remove the ptest.
      auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
      if (Mask != PredMask)
        return false;

      // Switch to the new opcode.
      NewOp = Pred->getOpcode() == AArch64::BRKB_PPzP ? AArch64::BRKBS_PPzP
                                                      : AArch64::BRKPBS_PPzPP;
      OpChanged = true;
      break;
    }
    case AArch64::BRKN_PPzP: {
      auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
      if (Mask != PredMask)
        return false;

      NewOp = AArch64::BRKNS_PPzP;
      OpChanged = true;
      break;
    }
    case AArch64::RDFFR_PPz: {
      // rdffr   p1.b, PredMask=p0/z <--- Definition of Pred
      // ptest   Mask=p0, Pred=p1.b  <--- If equal masks, remove this and use
      //                                  `rdffrs p1.b, p0/z` above.
      auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
      if (Mask != PredMask)
        return false;

      NewOp = AArch64::RDFFRS_PPz;
      OpChanged = true;
      break;
    }
    default:
      // Bail out if we don't recognize the input.
      return false;
    }
  }

  const TargetRegisterInfo *TRI = &getRegisterInfo();

  // If another instruction between Pred and PTest accesses flags, don't remove
  // the ptest or update the earlier instruction to modify them.
  if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI))
    return false;

  // If we pass all the checks, it's safe to remove the PTEST and use the flags
  // as they are prior to PTEST. Sometimes this requires the tested PTEST
  // operand to be replaced with an equivalent instruction that also sets the
  // flags.
  Pred->setDesc(get(NewOp));
  PTest->eraseFromParent();
  if (OpChanged) {
    bool succeeded = UpdateOperandRegClass(*Pred);
    (void)succeeded;
    assert(succeeded && "Operands have incompatible register classes!");
    Pred->addRegisterDefined(AArch64::NZCV, TRI);
  }

  // Ensure that the flags def is live.
  if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
    unsigned i = 0, e = Pred->getNumOperands();
    for (; i != e; ++i) {
      MachineOperand &MO = Pred->getOperand(i);
      if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
        MO.setIsDead(false);
        break;
      }
    }
  }
  return true;
}

/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It is truly a compare instruction
/// when there are no uses of its destination register.
///
/// The following steps are tried in order:
/// 1. Convert CmpInstr into an unconditional version.
/// 2. Remove CmpInstr if above there is an instruction producing a needed
///    condition code or an instruction which can be converted into such an
///    instruction.
///    Only comparison with zero is supported.
bool AArch64InstrInfo::optimizeCompareInstr(
    MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int CmpMask,
    int CmpValue, const MachineRegisterInfo *MRI) const {
  assert(CmpInstr.getParent());
  assert(MRI);

  // Replace SUBSWrr with SUBWrr if NZCV is not used.
  int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
  if (DeadNZCVIdx != -1) {
    if (CmpInstr.definesRegister(AArch64::WZR) ||
        CmpInstr.definesRegister(AArch64::XZR)) {
      CmpInstr.eraseFromParent();
      return true;
    }
    unsigned Opc = CmpInstr.getOpcode();
    unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
    if (NewOpc == Opc)
      return false;
    const MCInstrDesc &MCID = get(NewOpc);
    CmpInstr.setDesc(MCID);
    CmpInstr.RemoveOperand(DeadNZCVIdx);
    bool succeeded = UpdateOperandRegClass(CmpInstr);
    (void)succeeded;
    assert(succeeded && "Some operands reg class are incompatible!");
    return true;
  }

  if (CmpInstr.getOpcode() == AArch64::PTEST_PP)
    return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);

  // Warning: CmpValue == 2 indicates *any* value apart from 0 or 1.
  assert((CmpValue == 0 || CmpValue == 1 || CmpValue == 2) &&
         "CmpValue must be 0, 1, or 2!");
  if (SrcReg2 != 0)
    return false;

  // CmpInstr is a Compare instruction if destination register is not used.
  if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
    return false;

  if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, *MRI))
    return true;
  return (CmpValue == 0 || CmpValue == 1) &&
         removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI);
}

/// Get opcode of S version of Instr.
/// If Instr is S version its opcode is returned.
/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
/// version or we are not interested in it.
static unsigned sForm(MachineInstr &Instr) {
  switch (Instr.getOpcode()) {
  default:
    return AArch64::INSTRUCTION_LIST_END;

  case AArch64::ADDSWrr:
  case AArch64::ADDSWri:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXri:
  case AArch64::SUBSWrr:
  case AArch64::SUBSWri:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXri:
    return Instr.getOpcode();

  case AArch64::ADDWrr:
    return AArch64::ADDSWrr;
  case AArch64::ADDWri:
    return AArch64::ADDSWri;
  case AArch64::ADDXrr:
    return AArch64::ADDSXrr;
  case AArch64::ADDXri:
    return AArch64::ADDSXri;
  case AArch64::ADCWr:
    return AArch64::ADCSWr;
  case AArch64::ADCXr:
    return AArch64::ADCSXr;
  case AArch64::SUBWrr:
    return AArch64::SUBSWrr;
  case AArch64::SUBWri:
    return AArch64::SUBSWri;
  case AArch64::SUBXrr:
    return AArch64::SUBSXrr;
  case AArch64::SUBXri:
    return AArch64::SUBSXri;
  case AArch64::SBCWr:
    return AArch64::SBCSWr;
  case AArch64::SBCXr:
    return AArch64::SBCSXr;
  case AArch64::ANDWri:
    return AArch64::ANDSWri;
  case AArch64::ANDXri:
    return AArch64::ANDSXri;
  }
}

/// Check if AArch64::NZCV should be alive in successors of MBB.
static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
  for (auto *BB : MBB->successors())
    if (BB->isLiveIn(AArch64::NZCV))
      return true;
  return false;
}

/// \returns The condition code operand index for \p Instr if it is a branch
/// or select and -1 otherwise.
static int
findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
  switch (Instr.getOpcode()) {
  default:
    return -1;

  case AArch64::Bcc: {
    int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
    assert(Idx >= 2);
    return Idx - 2;
  }

  case AArch64::CSINVWr:
  case AArch64::CSINVXr:
  case AArch64::CSINCWr:
  case AArch64::CSINCXr:
  case AArch64::CSELWr:
  case AArch64::CSELXr:
  case AArch64::CSNEGWr:
  case AArch64::CSNEGXr:
  case AArch64::FCSELSrrr:
  case AArch64::FCSELDrrr: {
    int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
    assert(Idx >= 1);
    return Idx - 1;
  }
  }
}

namespace {

struct UsedNZCV {
  bool N = false;
  bool Z = false;
  bool C = false;
  bool V = false;

  UsedNZCV() = default;

  UsedNZCV &operator|=(const UsedNZCV &UsedFlags) {
    this->N |= UsedFlags.N;
    this->Z |= UsedFlags.Z;
    this->C |= UsedFlags.C;
    this->V |= UsedFlags.V;
    return *this;
  }
};

} // end anonymous namespace

/// Find a condition code used by the instruction.
/// Returns AArch64CC::Invalid if either the instruction does not use condition
/// codes or we don't optimize CmpInstr in the presence of such instructions.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
  int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
  return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
                          Instr.getOperand(CCIdx).getImm())
                    : AArch64CC::Invalid;
}

static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
  assert(CC != AArch64CC::Invalid);
  UsedNZCV UsedFlags;
  switch (CC) {
  default:
    break;

  case AArch64CC::EQ: // Z set
  case AArch64CC::NE: // Z clear
    UsedFlags.Z = true;
    break;

  case AArch64CC::HI: // Z clear and C set
  case AArch64CC::LS: // Z set or C clear
    UsedFlags.Z = true;
    LLVM_FALLTHROUGH;
  case AArch64CC::HS: // C set
  case AArch64CC::LO: // C clear
    UsedFlags.C = true;
    break;

  case AArch64CC::MI: // N set
  case AArch64CC::PL: // N clear
    UsedFlags.N = true;
    break;

  case AArch64CC::VS: // V set
  case AArch64CC::VC: // V clear
    UsedFlags.V = true;
    break;

  case AArch64CC::GT: // Z clear, N and V the same
  case AArch64CC::LE: // Z set, N and V differ
    UsedFlags.Z = true;
    LLVM_FALLTHROUGH;
  case AArch64CC::GE: // N and V the same
  case AArch64CC::LT: // N and V differ
    UsedFlags.N = true;
    UsedFlags.V = true;
    break;
  }
  return UsedFlags;
}

/// \returns The condition flags used after \p CmpInstr in its MachineBB if
/// they do not contain the C or V flags and NZCV flags are not alive in
/// successors of the common \p CmpInstr and \p MI parent. \returns None
/// otherwise.
///
/// Collects instructions using those flags in \p CCUseInstrs if provided.
static Optional<UsedNZCV>
examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
                 const TargetRegisterInfo &TRI,
                 SmallVectorImpl<MachineInstr *> *CCUseInstrs = nullptr) {
  MachineBasicBlock *CmpParent = CmpInstr.getParent();
  if (MI.getParent() != CmpParent)
    return None;

  if (areCFlagsAliveInSuccessors(CmpParent))
    return None;

  UsedNZCV NZCVUsedAfterCmp;
  for (MachineInstr &Instr : instructionsWithoutDebug(
           std::next(CmpInstr.getIterator()), CmpParent->instr_end())) {
    if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
      AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
      if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
        return None;
      NZCVUsedAfterCmp |= getUsedNZCV(CC);
      if (CCUseInstrs)
        CCUseInstrs->push_back(&Instr);
    }
    if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
      break;
  }
  if (NZCVUsedAfterCmp.C || NZCVUsedAfterCmp.V)
    return None;
  return NZCVUsedAfterCmp;
}

static bool isADDSRegImm(unsigned Opcode) {
  return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
}

static bool isSUBSRegImm(unsigned Opcode) {
  return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
}

/// Check if CmpInstr can be substituted by MI.
///
/// CmpInstr can be substituted:
/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
/// - and, MI and CmpInstr are from the same MachineBB
/// - and, condition flags are not alive in successors of the CmpInstr parent
/// - and, if MI opcode is the S form there must be no defs of flags between
///        MI and CmpInstr
///        or if MI opcode is not the S form there must be neither defs of
///        flags nor uses of flags between MI and CmpInstr.
/// - and, C/V flags are not used after CmpInstr
static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
                                       const TargetRegisterInfo &TRI) {
  assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);

  const unsigned CmpOpcode = CmpInstr.getOpcode();
  if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
    return false;

  if (!examineCFlagsUse(MI, CmpInstr, TRI))
    return false;

  AccessKind AccessToCheck = AK_Write;
  if (sForm(MI) != MI.getOpcode())
    AccessToCheck = AK_All;
  return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck);
}

/// Substitute an instruction comparing to zero with another instruction
/// which produces needed condition flags.
///
/// Return true on success.
bool AArch64InstrInfo::substituteCmpToZero(
    MachineInstr &CmpInstr, unsigned SrcReg,
    const MachineRegisterInfo &MRI) const {
  // Get the unique definition of SrcReg.
  MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg);
  if (!MI)
    return false;

  const TargetRegisterInfo &TRI = getRegisterInfo();

  unsigned NewOpc = sForm(*MI);
  if (NewOpc == AArch64::INSTRUCTION_LIST_END)
    return false;

  if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI))
    return false;

  // Update the instruction to set NZCV.
  MI->setDesc(get(NewOpc));
  CmpInstr.eraseFromParent();
  bool succeeded = UpdateOperandRegClass(*MI);
  (void)succeeded;
  assert(succeeded && "Some operands reg class are incompatible!");
  MI->addRegisterDefined(AArch64::NZCV, &TRI);
  return true;
}

/// \returns True if \p CmpInstr can be removed.
///
/// \p IsInvertCC is true if, after removing \p CmpInstr, condition
/// codes used in \p CCUseInstrs must be inverted.
1755 static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
1756 int CmpValue, const TargetRegisterInfo &TRI,
1757 SmallVectorImpl<MachineInstr *> &CCUseInstrs,
1758 bool &IsInvertCC) {
1759 assert((CmpValue == 0 || CmpValue == 1) &&
1760 "Only comparisons to 0 or 1 considered for removal!");
1761
1762 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
1763 unsigned MIOpc = MI.getOpcode();
1764 if (MIOpc == AArch64::CSINCWr) {
1765 if (MI.getOperand(1).getReg() != AArch64::WZR ||
1766 MI.getOperand(2).getReg() != AArch64::WZR)
1767 return false;
1768 } else if (MIOpc == AArch64::CSINCXr) {
1769 if (MI.getOperand(1).getReg() != AArch64::XZR ||
1770 MI.getOperand(2).getReg() != AArch64::XZR)
1771 return false;
1772 } else {
1773 return false;
1774 }
1775 AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI);
1776 if (MICC == AArch64CC::Invalid)
1777 return false;
1778
1779 // NZCV needs to be defined
1780 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
1781 return false;
1782
1783 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
1784 const unsigned CmpOpcode = CmpInstr.getOpcode();
1785 bool IsSubsRegImm = isSUBSRegImm(CmpOpcode);
1786 if (CmpValue && !IsSubsRegImm)
1787 return false;
1788 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode))
1789 return false;
1790
1791 // MI conditions allowed: eq, ne, mi, pl
1792 UsedNZCV MIUsedNZCV = getUsedNZCV(MICC);
1793 if (MIUsedNZCV.C || MIUsedNZCV.V)
1794 return false;
1795
1796 Optional<UsedNZCV> NZCVUsedAfterCmp =
1797 examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs);
1798 // Condition flags are not used in CmpInstr basic block successors and only
1799 // Z or N flags are allowed to be used after CmpInstr within its basic block
1800 if (!NZCVUsedAfterCmp)
1801 return false;
1802 // Z or N flag used after CmpInstr must correspond to the flag used in MI
1803 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
1804 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
1805 return false;
1806 // If CmpInstr is a comparison to zero, MI conditions are limited to eq, ne
1807 if (MIUsedNZCV.N && !CmpValue)
1808 return false;
1809
1810 // There must be no defs of flags between MI and CmpInstr
1811 if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write))
1812 return false;
1813
1814 // Condition code is inverted in the following cases:
1815 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1816 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
1817 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
1818 (!CmpValue && MICC == AArch64CC::NE);
1819 return true;
1820 }
1821
1822 /// Remove comparison in csinc-cmp sequence
1823 ///
1824 /// Examples:
1825 /// 1. \code
1826 /// csinc w9, wzr, wzr, ne
1827 /// cmp w9, #0
1828 /// b.eq
1829 /// \endcode
1830 /// to
1831 /// \code
1832 /// csinc w9, wzr, wzr, ne
1833 /// b.ne
1834 /// \endcode
1835 ///
1836 /// 2.
\code 1837 /// csinc x2, xzr, xzr, mi 1838 /// cmp x2, #1 1839 /// b.pl 1840 /// \endcode 1841 /// to 1842 /// \code 1843 /// csinc x2, xzr, xzr, mi 1844 /// b.pl 1845 /// \endcode 1846 /// 1847 /// \param CmpInstr comparison instruction 1848 /// \return True when comparison removed 1849 bool AArch64InstrInfo::removeCmpToZeroOrOne( 1850 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue, 1851 const MachineRegisterInfo &MRI) const { 1852 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg); 1853 if (!MI) 1854 return false; 1855 const TargetRegisterInfo &TRI = getRegisterInfo(); 1856 SmallVector<MachineInstr *, 4> CCUseInstrs; 1857 bool IsInvertCC = false; 1858 if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs, 1859 IsInvertCC)) 1860 return false; 1861 // Make transformation 1862 CmpInstr.eraseFromParent(); 1863 if (IsInvertCC) { 1864 // Invert condition codes in CmpInstr CC users 1865 for (MachineInstr *CCUseInstr : CCUseInstrs) { 1866 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr); 1867 assert(Idx >= 0 && "Unexpected instruction using CC."); 1868 MachineOperand &CCOperand = CCUseInstr->getOperand(Idx); 1869 AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode( 1870 static_cast<AArch64CC::CondCode>(CCOperand.getImm())); 1871 CCOperand.setImm(CCUse); 1872 } 1873 } 1874 return true; 1875 } 1876 1877 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { 1878 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD && 1879 MI.getOpcode() != AArch64::CATCHRET) 1880 return false; 1881 1882 MachineBasicBlock &MBB = *MI.getParent(); 1883 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>(); 1884 auto TRI = Subtarget.getRegisterInfo(); 1885 DebugLoc DL = MI.getDebugLoc(); 1886 1887 if (MI.getOpcode() == AArch64::CATCHRET) { 1888 // Skip to the first instruction before the epilog. 
1889 const TargetInstrInfo *TII =
1890 MBB.getParent()->getSubtarget().getInstrInfo();
1891 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
1892 auto MBBI = MachineBasicBlock::iterator(MI);
1893 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI);
1894 while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) &&
1895 FirstEpilogSEH != MBB.begin())
1896 FirstEpilogSEH = std::prev(FirstEpilogSEH);
1897 if (FirstEpilogSEH != MBB.begin())
1898 FirstEpilogSEH = std::next(FirstEpilogSEH);
1899 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
1900 .addReg(AArch64::X0, RegState::Define)
1901 .addMBB(TargetMBB);
1902 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
1903 .addReg(AArch64::X0, RegState::Define)
1904 .addReg(AArch64::X0)
1905 .addMBB(TargetMBB)
1906 .addImm(0);
1907 return true;
1908 }
1909
1910 Register Reg = MI.getOperand(0).getReg();
1911 Module &M = *MBB.getParent()->getFunction().getParent();
1912 if (M.getStackProtectorGuard() == "sysreg") {
1913 const AArch64SysReg::SysReg *SrcReg =
1914 AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
1915 if (!SrcReg)
1916 report_fatal_error("Unknown SysReg for Stack Protector Guard Register");
1917
1918 // mrs xN, sysreg
1919 BuildMI(MBB, MI, DL, get(AArch64::MRS))
1920 .addDef(Reg, RegState::Renamable)
1921 .addImm(SrcReg->Encoding);
1922 int Offset = M.getStackProtectorGuardOffset();
1923 if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
1924 // ldr xN, [xN, #offset]
1925 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
1926 .addDef(Reg)
1927 .addUse(Reg, RegState::Kill)
1928 .addImm(Offset / 8);
1929 } else if (Offset >= -256 && Offset <= 255) {
1930 // ldur xN, [xN, #offset]
1931 BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
1932 .addDef(Reg)
1933 .addUse(Reg, RegState::Kill)
1934 .addImm(Offset);
1935 } else if (Offset >= -4095 && Offset <= 4095) {
1936 if (Offset > 0) {
1937 // add xN, xN, #offset
1938 BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
1939 .addDef(Reg)
1940 .addUse(Reg, RegState::Kill)
1941 .addImm(Offset)
1942 .addImm(0);
1943 } else {
1944 // sub xN, xN, #offset
1945 BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
1946 .addDef(Reg)
1947 .addUse(Reg, RegState::Kill)
1948 .addImm(-Offset)
1949 .addImm(0);
1950 }
1951 // ldr xN, [xN]
1952 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
1953 .addDef(Reg)
1954 .addUse(Reg, RegState::Kill)
1955 .addImm(0);
1956 } else {
1957 // Cases that are larger than +/- 4095 and not a multiple of 8, or larger
1958 // than 32760.
1959 // It might be nice to use AArch64::MOVi32imm here, which would get
1960 // expanded in PreSched2 after PostRA, but our lone scratch Reg already
1961 // contains the MRS result. findScratchNonCalleeSaveRegister() in
1962 // AArch64FrameLowering might help us find such a scratch register
1963 // though. If we failed to find a scratch register, we could emit a
1964 // stream of add instructions to build up the immediate. Or, we could try
1965 // to insert an AArch64::MOVi32imm before register allocation so that we
1966 // didn't need to scavenge for a scratch register.
1967 report_fatal_error("Unable to encode Stack Protector Guard Offset"); 1968 } 1969 MBB.erase(MI); 1970 return true; 1971 } 1972 1973 const GlobalValue *GV = 1974 cast<GlobalValue>((*MI.memoperands_begin())->getValue()); 1975 const TargetMachine &TM = MBB.getParent()->getTarget(); 1976 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); 1977 const unsigned char MO_NC = AArch64II::MO_NC; 1978 1979 if ((OpFlags & AArch64II::MO_GOT) != 0) { 1980 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg) 1981 .addGlobalAddress(GV, 0, OpFlags); 1982 if (Subtarget.isTargetILP32()) { 1983 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 1984 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 1985 .addDef(Reg32, RegState::Dead) 1986 .addUse(Reg, RegState::Kill) 1987 .addImm(0) 1988 .addMemOperand(*MI.memoperands_begin()) 1989 .addDef(Reg, RegState::Implicit); 1990 } else { 1991 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 1992 .addReg(Reg, RegState::Kill) 1993 .addImm(0) 1994 .addMemOperand(*MI.memoperands_begin()); 1995 } 1996 } else if (TM.getCodeModel() == CodeModel::Large) { 1997 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?"); 1998 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) 1999 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC) 2000 .addImm(0); 2001 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2002 .addReg(Reg, RegState::Kill) 2003 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC) 2004 .addImm(16); 2005 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2006 .addReg(Reg, RegState::Kill) 2007 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC) 2008 .addImm(32); 2009 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2010 .addReg(Reg, RegState::Kill) 2011 .addGlobalAddress(GV, 0, AArch64II::MO_G3) 2012 .addImm(48); 2013 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 2014 .addReg(Reg, RegState::Kill) 2015 .addImm(0) 2016 .addMemOperand(*MI.memoperands_begin()); 2017 } else if (TM.getCodeModel() == CodeModel::Tiny) { 2018 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg) 2019 .addGlobalAddress(GV, 0, OpFlags); 2020 } else { 2021 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg) 2022 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); 2023 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC; 2024 if (Subtarget.isTargetILP32()) { 2025 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 2026 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 2027 .addDef(Reg32, RegState::Dead) 2028 .addUse(Reg, RegState::Kill) 2029 .addGlobalAddress(GV, 0, LoFlags) 2030 .addMemOperand(*MI.memoperands_begin()) 2031 .addDef(Reg, RegState::Implicit); 2032 } else { 2033 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 2034 .addReg(Reg, RegState::Kill) 2035 .addGlobalAddress(GV, 0, LoFlags) 2036 .addMemOperand(*MI.memoperands_begin()); 2037 } 2038 } 2039 2040 MBB.erase(MI); 2041 2042 return true; 2043 } 2044 2045 // Return true if this instruction simply sets its single destination register 2046 // to zero. This is equivalent to a register rename of the zero-register. 
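// For example (illustrative only):
//   movz w8, #0            // MOVZWi with a zero shift
//   and  w8, wzr, #0xff    // ANDWri whose first source is WZR
// both leave w8 equal to zero, so they behave like a rename of WZR.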
2047 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
2048 switch (MI.getOpcode()) {
2049 default:
2050 break;
2051 case AArch64::MOVZWi:
2052 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2053 if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
2054 assert(MI.getDesc().getNumOperands() == 3 &&
2055 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2056 return true;
2057 }
2058 break;
2059 case AArch64::ANDWri: // and Rd, Rzr, #imm
2060 return MI.getOperand(1).getReg() == AArch64::WZR;
2061 case AArch64::ANDXri:
2062 return MI.getOperand(1).getReg() == AArch64::XZR;
2063 case TargetOpcode::COPY:
2064 return MI.getOperand(1).getReg() == AArch64::WZR;
2065 }
2066 return false;
2067 }
2068
2069 // Return true if this instruction simply renames a general register without
2070 // modifying bits.
2071 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
2072 switch (MI.getOpcode()) {
2073 default:
2074 break;
2075 case TargetOpcode::COPY: {
2076 // GPR32 copies will be lowered to ORRXrs
2077 Register DstReg = MI.getOperand(0).getReg();
2078 return (AArch64::GPR32RegClass.contains(DstReg) ||
2079 AArch64::GPR64RegClass.contains(DstReg));
2080 }
2081 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2082 if (MI.getOperand(1).getReg() == AArch64::XZR) {
2083 assert(MI.getDesc().getNumOperands() == 4 &&
2084 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2085 return true;
2086 }
2087 break;
2088 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2089 if (MI.getOperand(2).getImm() == 0) {
2090 assert(MI.getDesc().getNumOperands() == 4 &&
2091 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2092 return true;
2093 }
2094 break;
2095 }
2096 return false;
2097 }
2098
2099 // Return true if this instruction simply renames a floating-point register
2100 // without modifying bits.
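// For example (illustrative only), "orr v0.16b, v1.16b, v1.16b" copies q1
// into q0 bit-for-bit, as does a plain COPY between two FPR64/FPR128 regs.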
2101 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
2102 switch (MI.getOpcode()) {
2103 default:
2104 break;
2105 case TargetOpcode::COPY: {
2106 // FPR64 copies will be lowered to ORR.16b
2107 Register DstReg = MI.getOperand(0).getReg();
2108 return (AArch64::FPR64RegClass.contains(DstReg) ||
2109 AArch64::FPR128RegClass.contains(DstReg));
2110 }
2111 case AArch64::ORRv16i8:
2112 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
2113 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2114 "invalid ORRv16i8 operands");
2115 return true;
2116 }
2117 break;
2118 }
2119 return false;
2120 }
2121
2122 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
2123 int &FrameIndex) const {
2124 switch (MI.getOpcode()) {
2125 default:
2126 break;
2127 case AArch64::LDRWui:
2128 case AArch64::LDRXui:
2129 case AArch64::LDRBui:
2130 case AArch64::LDRHui:
2131 case AArch64::LDRSui:
2132 case AArch64::LDRDui:
2133 case AArch64::LDRQui:
2134 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2135 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2136 FrameIndex = MI.getOperand(1).getIndex();
2137 return MI.getOperand(0).getReg();
2138 }
2139 break;
2140 }
2141
2142 return 0;
2143 }
2144
2145 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
2146 int &FrameIndex) const {
2147 switch (MI.getOpcode()) {
2148 default:
2149 break;
2150 case AArch64::STRWui:
2151 case AArch64::STRXui:
2152 case AArch64::STRBui:
2153 case AArch64::STRHui:
2154 case AArch64::STRSui:
2155 case AArch64::STRDui:
2156 case AArch64::STRQui:
2157 case AArch64::LDR_PXI:
2158 case AArch64::STR_PXI:
2159 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
2160 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
2161 FrameIndex = MI.getOperand(1).getIndex();
2162 return MI.getOperand(0).getReg();
2163 }
2164 break;
2165 }
2166 return 0;
2167 }
2168
2169 /// Check all MachineMemOperands for a hint to suppress pairing.
2170 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
2171 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
2172 return MMO->getFlags() & MOSuppressPair;
2173 });
2174 }
2175
2176 /// Set a flag on the first MachineMemOperand to suppress pairing.
2177 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
2178 if (MI.memoperands_empty())
2179 return;
2180 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2181 }
2182
2183 /// Check all MachineMemOperands for a hint that the load/store is strided.
2184 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) { 2185 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 2186 return MMO->getFlags() & MOStridedAccess; 2187 }); 2188 } 2189 2190 bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) { 2191 switch (Opc) { 2192 default: 2193 return false; 2194 case AArch64::STURSi: 2195 case AArch64::STRSpre: 2196 case AArch64::STURDi: 2197 case AArch64::STRDpre: 2198 case AArch64::STURQi: 2199 case AArch64::STRQpre: 2200 case AArch64::STURBBi: 2201 case AArch64::STURHHi: 2202 case AArch64::STURWi: 2203 case AArch64::STRWpre: 2204 case AArch64::STURXi: 2205 case AArch64::STRXpre: 2206 case AArch64::LDURSi: 2207 case AArch64::LDRSpre: 2208 case AArch64::LDURDi: 2209 case AArch64::LDRDpre: 2210 case AArch64::LDURQi: 2211 case AArch64::LDRQpre: 2212 case AArch64::LDURWi: 2213 case AArch64::LDRWpre: 2214 case AArch64::LDURXi: 2215 case AArch64::LDRXpre: 2216 case AArch64::LDURSWi: 2217 case AArch64::LDURHHi: 2218 case AArch64::LDURBBi: 2219 case AArch64::LDURSBWi: 2220 case AArch64::LDURSHWi: 2221 return true; 2222 } 2223 } 2224 2225 Optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) { 2226 switch (Opc) { 2227 default: return {}; 2228 case AArch64::PRFMui: return AArch64::PRFUMi; 2229 case AArch64::LDRXui: return AArch64::LDURXi; 2230 case AArch64::LDRWui: return AArch64::LDURWi; 2231 case AArch64::LDRBui: return AArch64::LDURBi; 2232 case AArch64::LDRHui: return AArch64::LDURHi; 2233 case AArch64::LDRSui: return AArch64::LDURSi; 2234 case AArch64::LDRDui: return AArch64::LDURDi; 2235 case AArch64::LDRQui: return AArch64::LDURQi; 2236 case AArch64::LDRBBui: return AArch64::LDURBBi; 2237 case AArch64::LDRHHui: return AArch64::LDURHHi; 2238 case AArch64::LDRSBXui: return AArch64::LDURSBXi; 2239 case AArch64::LDRSBWui: return AArch64::LDURSBWi; 2240 case AArch64::LDRSHXui: return AArch64::LDURSHXi; 2241 case AArch64::LDRSHWui: return AArch64::LDURSHWi; 2242 case AArch64::LDRSWui: return AArch64::LDURSWi; 2243 case AArch64::STRXui: return AArch64::STURXi; 2244 case AArch64::STRWui: return AArch64::STURWi; 2245 case AArch64::STRBui: return AArch64::STURBi; 2246 case AArch64::STRHui: return AArch64::STURHi; 2247 case AArch64::STRSui: return AArch64::STURSi; 2248 case AArch64::STRDui: return AArch64::STURDi; 2249 case AArch64::STRQui: return AArch64::STURQi; 2250 case AArch64::STRBBui: return AArch64::STURBBi; 2251 case AArch64::STRHHui: return AArch64::STURHHi; 2252 } 2253 } 2254 2255 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { 2256 switch (Opc) { 2257 default: 2258 return 2; 2259 case AArch64::LDPXi: 2260 case AArch64::LDPDi: 2261 case AArch64::STPXi: 2262 case AArch64::STPDi: 2263 case AArch64::LDNPXi: 2264 case AArch64::LDNPDi: 2265 case AArch64::STNPXi: 2266 case AArch64::STNPDi: 2267 case AArch64::LDPQi: 2268 case AArch64::STPQi: 2269 case AArch64::LDNPQi: 2270 case AArch64::STNPQi: 2271 case AArch64::LDPWi: 2272 case AArch64::LDPSi: 2273 case AArch64::STPWi: 2274 case AArch64::STPSi: 2275 case AArch64::LDNPWi: 2276 case AArch64::LDNPSi: 2277 case AArch64::STNPWi: 2278 case AArch64::STNPSi: 2279 case AArch64::LDG: 2280 case AArch64::STGPi: 2281 case AArch64::LD1B_IMM: 2282 case AArch64::LD1H_IMM: 2283 case AArch64::LD1W_IMM: 2284 case AArch64::LD1D_IMM: 2285 case AArch64::ST1B_IMM: 2286 case AArch64::ST1H_IMM: 2287 case AArch64::ST1W_IMM: 2288 case AArch64::ST1D_IMM: 2289 case AArch64::LD1B_H_IMM: 2290 case AArch64::LD1SB_H_IMM: 2291 case AArch64::LD1H_S_IMM: 2292 case 
AArch64::LD1SH_S_IMM: 2293 case AArch64::LD1W_D_IMM: 2294 case AArch64::LD1SW_D_IMM: 2295 case AArch64::ST1B_H_IMM: 2296 case AArch64::ST1H_S_IMM: 2297 case AArch64::ST1W_D_IMM: 2298 case AArch64::LD1B_S_IMM: 2299 case AArch64::LD1SB_S_IMM: 2300 case AArch64::LD1H_D_IMM: 2301 case AArch64::LD1SH_D_IMM: 2302 case AArch64::ST1B_S_IMM: 2303 case AArch64::ST1H_D_IMM: 2304 case AArch64::LD1B_D_IMM: 2305 case AArch64::LD1SB_D_IMM: 2306 case AArch64::ST1B_D_IMM: 2307 case AArch64::LD1RB_IMM: 2308 case AArch64::LD1RB_H_IMM: 2309 case AArch64::LD1RB_S_IMM: 2310 case AArch64::LD1RB_D_IMM: 2311 case AArch64::LD1RSB_H_IMM: 2312 case AArch64::LD1RSB_S_IMM: 2313 case AArch64::LD1RSB_D_IMM: 2314 case AArch64::LD1RH_IMM: 2315 case AArch64::LD1RH_S_IMM: 2316 case AArch64::LD1RH_D_IMM: 2317 case AArch64::LD1RSH_S_IMM: 2318 case AArch64::LD1RSH_D_IMM: 2319 case AArch64::LD1RW_IMM: 2320 case AArch64::LD1RW_D_IMM: 2321 case AArch64::LD1RSW_IMM: 2322 case AArch64::LD1RD_IMM: 2323 return 3; 2324 case AArch64::ADDG: 2325 case AArch64::STGOffset: 2326 case AArch64::LDR_PXI: 2327 case AArch64::STR_PXI: 2328 return 2; 2329 } 2330 } 2331 2332 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) { 2333 switch (MI.getOpcode()) { 2334 default: 2335 return false; 2336 // Scaled instructions. 2337 case AArch64::STRSui: 2338 case AArch64::STRDui: 2339 case AArch64::STRQui: 2340 case AArch64::STRXui: 2341 case AArch64::STRWui: 2342 case AArch64::LDRSui: 2343 case AArch64::LDRDui: 2344 case AArch64::LDRQui: 2345 case AArch64::LDRXui: 2346 case AArch64::LDRWui: 2347 case AArch64::LDRSWui: 2348 // Unscaled instructions. 2349 case AArch64::STURSi: 2350 case AArch64::STRSpre: 2351 case AArch64::STURDi: 2352 case AArch64::STRDpre: 2353 case AArch64::STURQi: 2354 case AArch64::STRQpre: 2355 case AArch64::STURWi: 2356 case AArch64::STRWpre: 2357 case AArch64::STURXi: 2358 case AArch64::STRXpre: 2359 case AArch64::LDURSi: 2360 case AArch64::LDRSpre: 2361 case AArch64::LDURDi: 2362 case AArch64::LDRDpre: 2363 case AArch64::LDURQi: 2364 case AArch64::LDRQpre: 2365 case AArch64::LDURWi: 2366 case AArch64::LDRWpre: 2367 case AArch64::LDURXi: 2368 case AArch64::LDRXpre: 2369 case AArch64::LDURSWi: 2370 return true; 2371 } 2372 } 2373 2374 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc, 2375 bool &Is64Bit) { 2376 switch (Opc) { 2377 default: 2378 llvm_unreachable("Opcode has no flag setting equivalent!"); 2379 // 32-bit cases: 2380 case AArch64::ADDWri: 2381 Is64Bit = false; 2382 return AArch64::ADDSWri; 2383 case AArch64::ADDWrr: 2384 Is64Bit = false; 2385 return AArch64::ADDSWrr; 2386 case AArch64::ADDWrs: 2387 Is64Bit = false; 2388 return AArch64::ADDSWrs; 2389 case AArch64::ADDWrx: 2390 Is64Bit = false; 2391 return AArch64::ADDSWrx; 2392 case AArch64::ANDWri: 2393 Is64Bit = false; 2394 return AArch64::ANDSWri; 2395 case AArch64::ANDWrr: 2396 Is64Bit = false; 2397 return AArch64::ANDSWrr; 2398 case AArch64::ANDWrs: 2399 Is64Bit = false; 2400 return AArch64::ANDSWrs; 2401 case AArch64::BICWrr: 2402 Is64Bit = false; 2403 return AArch64::BICSWrr; 2404 case AArch64::BICWrs: 2405 Is64Bit = false; 2406 return AArch64::BICSWrs; 2407 case AArch64::SUBWri: 2408 Is64Bit = false; 2409 return AArch64::SUBSWri; 2410 case AArch64::SUBWrr: 2411 Is64Bit = false; 2412 return AArch64::SUBSWrr; 2413 case AArch64::SUBWrs: 2414 Is64Bit = false; 2415 return AArch64::SUBSWrs; 2416 case AArch64::SUBWrx: 2417 Is64Bit = false; 2418 return AArch64::SUBSWrx; 2419 // 64-bit cases: 2420 case AArch64::ADDXri: 2421 Is64Bit = true; 2422 
return AArch64::ADDSXri; 2423 case AArch64::ADDXrr: 2424 Is64Bit = true; 2425 return AArch64::ADDSXrr; 2426 case AArch64::ADDXrs: 2427 Is64Bit = true; 2428 return AArch64::ADDSXrs; 2429 case AArch64::ADDXrx: 2430 Is64Bit = true; 2431 return AArch64::ADDSXrx; 2432 case AArch64::ANDXri: 2433 Is64Bit = true; 2434 return AArch64::ANDSXri; 2435 case AArch64::ANDXrr: 2436 Is64Bit = true; 2437 return AArch64::ANDSXrr; 2438 case AArch64::ANDXrs: 2439 Is64Bit = true; 2440 return AArch64::ANDSXrs; 2441 case AArch64::BICXrr: 2442 Is64Bit = true; 2443 return AArch64::BICSXrr; 2444 case AArch64::BICXrs: 2445 Is64Bit = true; 2446 return AArch64::BICSXrs; 2447 case AArch64::SUBXri: 2448 Is64Bit = true; 2449 return AArch64::SUBSXri; 2450 case AArch64::SUBXrr: 2451 Is64Bit = true; 2452 return AArch64::SUBSXrr; 2453 case AArch64::SUBXrs: 2454 Is64Bit = true; 2455 return AArch64::SUBSXrs; 2456 case AArch64::SUBXrx: 2457 Is64Bit = true; 2458 return AArch64::SUBSXrx; 2459 } 2460 } 2461 2462 // Is this a candidate for ld/st merging or pairing? For example, we don't 2463 // touch volatiles or load/stores that have a hint to avoid pair formation. 2464 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const { 2465 2466 bool IsPreLdSt = isPreLdSt(MI); 2467 2468 // If this is a volatile load/store, don't mess with it. 2469 if (MI.hasOrderedMemoryRef()) 2470 return false; 2471 2472 // Make sure this is a reg/fi+imm (as opposed to an address reloc). 2473 // For Pre-inc LD/ST, the operand is shifted by one. 2474 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() || 2475 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) && 2476 "Expected a reg or frame index operand."); 2477 2478 // For Pre-indexed addressing quadword instructions, the third operand is the 2479 // immediate value. 2480 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm(); 2481 2482 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt) 2483 return false; 2484 2485 // Can't merge/pair if the instruction modifies the base register. 2486 // e.g., ldr x0, [x0] 2487 // This case will never occur with an FI base. 2488 // However, if the instruction is an LDR/STR<S,D,Q,W,X>pre, it can be merged. 2489 // For example: 2490 // ldr q0, [x11, #32]! 2491 // ldr q1, [x11, #16] 2492 // to 2493 // ldp q0, q1, [x11, #32]! 2494 if (MI.getOperand(1).isReg() && !IsPreLdSt) { 2495 Register BaseReg = MI.getOperand(1).getReg(); 2496 const TargetRegisterInfo *TRI = &getRegisterInfo(); 2497 if (MI.modifiesRegister(BaseReg, TRI)) 2498 return false; 2499 } 2500 2501 // Check if this load/store has a hint to avoid pair formation. 2502 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. 2503 if (isLdStPairSuppressed(MI)) 2504 return false; 2505 2506 // Do not pair any callee-save store/reload instructions in the 2507 // prologue/epilogue if the CFI information encoded the operations as separate 2508 // instructions, as that will cause the size of the actual prologue to mismatch 2509 // with the prologue size recorded in the Windows CFI. 2510 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo(); 2511 bool NeedsWinCFI = MAI->usesWindowsCFI() && 2512 MI.getMF()->getFunction().needsUnwindTableEntry(); 2513 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) || 2514 MI.getFlag(MachineInstr::FrameDestroy))) 2515 return false; 2516 2517 // On some CPUs quad load/store pairs are slower than two single load/stores. 
2518 if (Subtarget.isPaired128Slow()) { 2519 switch (MI.getOpcode()) { 2520 default: 2521 break; 2522 case AArch64::LDURQi: 2523 case AArch64::STURQi: 2524 case AArch64::LDRQui: 2525 case AArch64::STRQui: 2526 return false; 2527 } 2528 } 2529 2530 return true; 2531 } 2532 2533 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth( 2534 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, 2535 int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, 2536 const TargetRegisterInfo *TRI) const { 2537 if (!LdSt.mayLoadOrStore()) 2538 return false; 2539 2540 const MachineOperand *BaseOp; 2541 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable, 2542 Width, TRI)) 2543 return false; 2544 BaseOps.push_back(BaseOp); 2545 return true; 2546 } 2547 2548 Optional<ExtAddrMode> 2549 AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI, 2550 const TargetRegisterInfo *TRI) const { 2551 const MachineOperand *Base; // Filled with the base operand of MI. 2552 int64_t Offset; // Filled with the offset of MI. 2553 bool OffsetIsScalable; 2554 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI)) 2555 return None; 2556 2557 if (!Base->isReg()) 2558 return None; 2559 ExtAddrMode AM; 2560 AM.BaseReg = Base->getReg(); 2561 AM.Displacement = Offset; 2562 AM.ScaledReg = 0; 2563 return AM; 2564 } 2565 2566 bool AArch64InstrInfo::getMemOperandWithOffsetWidth( 2567 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset, 2568 bool &OffsetIsScalable, unsigned &Width, 2569 const TargetRegisterInfo *TRI) const { 2570 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 2571 // Handle only loads/stores with base register followed by immediate offset. 2572 if (LdSt.getNumExplicitOperands() == 3) { 2573 // Non-paired instruction (e.g., ldr x1, [x0, #8]). 2574 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) || 2575 !LdSt.getOperand(2).isImm()) 2576 return false; 2577 } else if (LdSt.getNumExplicitOperands() == 4) { 2578 // Paired instruction (e.g., ldp x1, x2, [x0, #8]). 2579 if (!LdSt.getOperand(1).isReg() || 2580 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) || 2581 !LdSt.getOperand(3).isImm()) 2582 return false; 2583 } else 2584 return false; 2585 2586 // Get the scaling factor for the instruction and set the width for the 2587 // instruction. 2588 TypeSize Scale(0U, false); 2589 int64_t Dummy1, Dummy2; 2590 2591 // If this returns false, then it's an instruction we don't want to handle. 2592 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2)) 2593 return false; 2594 2595 // Compute the offset. Offset is calculated as the immediate operand 2596 // multiplied by the scaling factor. Unscaled instructions have scaling factor 2597 // set to 1. 
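// (For instance, an LDRXui with immediate 3 addresses byte offset 3 * 8 = 24,
// while an unscaled LDURXi must encode 24 directly. Illustrative values,
// assuming the scales getMemOpInfo reports for those opcodes.)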
2598 if (LdSt.getNumExplicitOperands() == 3) { 2599 BaseOp = &LdSt.getOperand(1); 2600 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinSize(); 2601 } else { 2602 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); 2603 BaseOp = &LdSt.getOperand(2); 2604 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinSize(); 2605 } 2606 OffsetIsScalable = Scale.isScalable(); 2607 2608 if (!BaseOp->isReg() && !BaseOp->isFI()) 2609 return false; 2610 2611 return true; 2612 } 2613 2614 MachineOperand & 2615 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const { 2616 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 2617 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1); 2618 assert(OfsOp.isImm() && "Offset operand wasn't immediate."); 2619 return OfsOp; 2620 } 2621 2622 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, 2623 unsigned &Width, int64_t &MinOffset, 2624 int64_t &MaxOffset) { 2625 const unsigned SVEMaxBytesPerVector = AArch64::SVEMaxBitsPerVector / 8; 2626 switch (Opcode) { 2627 // Not a memory operation or something we want to handle. 2628 default: 2629 Scale = TypeSize::Fixed(0); 2630 Width = 0; 2631 MinOffset = MaxOffset = 0; 2632 return false; 2633 case AArch64::STRWpost: 2634 case AArch64::LDRWpost: 2635 Width = 32; 2636 Scale = TypeSize::Fixed(4); 2637 MinOffset = -256; 2638 MaxOffset = 255; 2639 break; 2640 case AArch64::LDURQi: 2641 case AArch64::STURQi: 2642 Width = 16; 2643 Scale = TypeSize::Fixed(1); 2644 MinOffset = -256; 2645 MaxOffset = 255; 2646 break; 2647 case AArch64::PRFUMi: 2648 case AArch64::LDURXi: 2649 case AArch64::LDURDi: 2650 case AArch64::STURXi: 2651 case AArch64::STURDi: 2652 Width = 8; 2653 Scale = TypeSize::Fixed(1); 2654 MinOffset = -256; 2655 MaxOffset = 255; 2656 break; 2657 case AArch64::LDURWi: 2658 case AArch64::LDURSi: 2659 case AArch64::LDURSWi: 2660 case AArch64::STURWi: 2661 case AArch64::STURSi: 2662 Width = 4; 2663 Scale = TypeSize::Fixed(1); 2664 MinOffset = -256; 2665 MaxOffset = 255; 2666 break; 2667 case AArch64::LDURHi: 2668 case AArch64::LDURHHi: 2669 case AArch64::LDURSHXi: 2670 case AArch64::LDURSHWi: 2671 case AArch64::STURHi: 2672 case AArch64::STURHHi: 2673 Width = 2; 2674 Scale = TypeSize::Fixed(1); 2675 MinOffset = -256; 2676 MaxOffset = 255; 2677 break; 2678 case AArch64::LDURBi: 2679 case AArch64::LDURBBi: 2680 case AArch64::LDURSBXi: 2681 case AArch64::LDURSBWi: 2682 case AArch64::STURBi: 2683 case AArch64::STURBBi: 2684 Width = 1; 2685 Scale = TypeSize::Fixed(1); 2686 MinOffset = -256; 2687 MaxOffset = 255; 2688 break; 2689 case AArch64::LDPQi: 2690 case AArch64::LDNPQi: 2691 case AArch64::STPQi: 2692 case AArch64::STNPQi: 2693 Scale = TypeSize::Fixed(16); 2694 Width = 32; 2695 MinOffset = -64; 2696 MaxOffset = 63; 2697 break; 2698 case AArch64::LDRQui: 2699 case AArch64::STRQui: 2700 Scale = TypeSize::Fixed(16); 2701 Width = 16; 2702 MinOffset = 0; 2703 MaxOffset = 4095; 2704 break; 2705 case AArch64::LDPXi: 2706 case AArch64::LDPDi: 2707 case AArch64::LDNPXi: 2708 case AArch64::LDNPDi: 2709 case AArch64::STPXi: 2710 case AArch64::STPDi: 2711 case AArch64::STNPXi: 2712 case AArch64::STNPDi: 2713 Scale = TypeSize::Fixed(8); 2714 Width = 16; 2715 MinOffset = -64; 2716 MaxOffset = 63; 2717 break; 2718 case AArch64::PRFMui: 2719 case AArch64::LDRXui: 2720 case AArch64::LDRDui: 2721 case AArch64::STRXui: 2722 case AArch64::STRDui: 2723 Scale = TypeSize::Fixed(8); 2724 Width = 8; 2725 MinOffset = 0; 2726 MaxOffset = 
4095; 2727 break; 2728 case AArch64::StoreSwiftAsyncContext: 2729 // Store is an STRXui, but there might be an ADDXri in the expansion too. 2730 Scale = TypeSize::Fixed(1); 2731 Width = 8; 2732 MinOffset = 0; 2733 MaxOffset = 4095; 2734 break; 2735 case AArch64::LDPWi: 2736 case AArch64::LDPSi: 2737 case AArch64::LDNPWi: 2738 case AArch64::LDNPSi: 2739 case AArch64::STPWi: 2740 case AArch64::STPSi: 2741 case AArch64::STNPWi: 2742 case AArch64::STNPSi: 2743 Scale = TypeSize::Fixed(4); 2744 Width = 8; 2745 MinOffset = -64; 2746 MaxOffset = 63; 2747 break; 2748 case AArch64::LDRWui: 2749 case AArch64::LDRSui: 2750 case AArch64::LDRSWui: 2751 case AArch64::STRWui: 2752 case AArch64::STRSui: 2753 Scale = TypeSize::Fixed(4); 2754 Width = 4; 2755 MinOffset = 0; 2756 MaxOffset = 4095; 2757 break; 2758 case AArch64::LDRHui: 2759 case AArch64::LDRHHui: 2760 case AArch64::LDRSHWui: 2761 case AArch64::LDRSHXui: 2762 case AArch64::STRHui: 2763 case AArch64::STRHHui: 2764 Scale = TypeSize::Fixed(2); 2765 Width = 2; 2766 MinOffset = 0; 2767 MaxOffset = 4095; 2768 break; 2769 case AArch64::LDRBui: 2770 case AArch64::LDRBBui: 2771 case AArch64::LDRSBWui: 2772 case AArch64::LDRSBXui: 2773 case AArch64::STRBui: 2774 case AArch64::STRBBui: 2775 Scale = TypeSize::Fixed(1); 2776 Width = 1; 2777 MinOffset = 0; 2778 MaxOffset = 4095; 2779 break; 2780 case AArch64::STPXpre: 2781 case AArch64::LDPXpost: 2782 case AArch64::STPDpre: 2783 case AArch64::LDPDpost: 2784 Scale = TypeSize::Fixed(8); 2785 Width = 8; 2786 MinOffset = -512; 2787 MaxOffset = 504; 2788 break; 2789 case AArch64::STPQpre: 2790 case AArch64::LDPQpost: 2791 Scale = TypeSize::Fixed(16); 2792 Width = 16; 2793 MinOffset = -1024; 2794 MaxOffset = 1008; 2795 break; 2796 case AArch64::STRXpre: 2797 case AArch64::STRDpre: 2798 case AArch64::LDRXpost: 2799 case AArch64::LDRDpost: 2800 Scale = TypeSize::Fixed(1); 2801 Width = 8; 2802 MinOffset = -256; 2803 MaxOffset = 255; 2804 break; 2805 case AArch64::STRQpre: 2806 case AArch64::LDRQpost: 2807 Scale = TypeSize::Fixed(1); 2808 Width = 16; 2809 MinOffset = -256; 2810 MaxOffset = 255; 2811 break; 2812 case AArch64::ADDG: 2813 Scale = TypeSize::Fixed(16); 2814 Width = 0; 2815 MinOffset = 0; 2816 MaxOffset = 63; 2817 break; 2818 case AArch64::TAGPstack: 2819 Scale = TypeSize::Fixed(16); 2820 Width = 0; 2821 // TAGP with a negative offset turns into SUBP, which has a maximum offset 2822 // of 63 (not 64!). 
2823 MinOffset = -63; 2824 MaxOffset = 63; 2825 break; 2826 case AArch64::LDG: 2827 case AArch64::STGOffset: 2828 case AArch64::STZGOffset: 2829 Scale = TypeSize::Fixed(16); 2830 Width = 16; 2831 MinOffset = -256; 2832 MaxOffset = 255; 2833 break; 2834 case AArch64::STR_ZZZZXI: 2835 case AArch64::LDR_ZZZZXI: 2836 Scale = TypeSize::Scalable(16); 2837 Width = SVEMaxBytesPerVector * 4; 2838 MinOffset = -256; 2839 MaxOffset = 252; 2840 break; 2841 case AArch64::STR_ZZZXI: 2842 case AArch64::LDR_ZZZXI: 2843 Scale = TypeSize::Scalable(16); 2844 Width = SVEMaxBytesPerVector * 3; 2845 MinOffset = -256; 2846 MaxOffset = 253; 2847 break; 2848 case AArch64::STR_ZZXI: 2849 case AArch64::LDR_ZZXI: 2850 Scale = TypeSize::Scalable(16); 2851 Width = SVEMaxBytesPerVector * 2; 2852 MinOffset = -256; 2853 MaxOffset = 254; 2854 break; 2855 case AArch64::LDR_PXI: 2856 case AArch64::STR_PXI: 2857 Scale = TypeSize::Scalable(2); 2858 Width = SVEMaxBytesPerVector / 8; 2859 MinOffset = -256; 2860 MaxOffset = 255; 2861 break; 2862 case AArch64::LDR_ZXI: 2863 case AArch64::STR_ZXI: 2864 Scale = TypeSize::Scalable(16); 2865 Width = SVEMaxBytesPerVector; 2866 MinOffset = -256; 2867 MaxOffset = 255; 2868 break; 2869 case AArch64::LD1B_IMM: 2870 case AArch64::LD1H_IMM: 2871 case AArch64::LD1W_IMM: 2872 case AArch64::LD1D_IMM: 2873 case AArch64::ST1B_IMM: 2874 case AArch64::ST1H_IMM: 2875 case AArch64::ST1W_IMM: 2876 case AArch64::ST1D_IMM: 2877 // A full vectors worth of data 2878 // Width = mbytes * elements 2879 Scale = TypeSize::Scalable(16); 2880 Width = SVEMaxBytesPerVector; 2881 MinOffset = -8; 2882 MaxOffset = 7; 2883 break; 2884 case AArch64::LD1B_H_IMM: 2885 case AArch64::LD1SB_H_IMM: 2886 case AArch64::LD1H_S_IMM: 2887 case AArch64::LD1SH_S_IMM: 2888 case AArch64::LD1W_D_IMM: 2889 case AArch64::LD1SW_D_IMM: 2890 case AArch64::ST1B_H_IMM: 2891 case AArch64::ST1H_S_IMM: 2892 case AArch64::ST1W_D_IMM: 2893 // A half vector worth of data 2894 // Width = mbytes * elements 2895 Scale = TypeSize::Scalable(8); 2896 Width = SVEMaxBytesPerVector / 2; 2897 MinOffset = -8; 2898 MaxOffset = 7; 2899 break; 2900 case AArch64::LD1B_S_IMM: 2901 case AArch64::LD1SB_S_IMM: 2902 case AArch64::LD1H_D_IMM: 2903 case AArch64::LD1SH_D_IMM: 2904 case AArch64::ST1B_S_IMM: 2905 case AArch64::ST1H_D_IMM: 2906 // A quarter vector worth of data 2907 // Width = mbytes * elements 2908 Scale = TypeSize::Scalable(4); 2909 Width = SVEMaxBytesPerVector / 4; 2910 MinOffset = -8; 2911 MaxOffset = 7; 2912 break; 2913 case AArch64::LD1B_D_IMM: 2914 case AArch64::LD1SB_D_IMM: 2915 case AArch64::ST1B_D_IMM: 2916 // A eighth vector worth of data 2917 // Width = mbytes * elements 2918 Scale = TypeSize::Scalable(2); 2919 Width = SVEMaxBytesPerVector / 8; 2920 MinOffset = -8; 2921 MaxOffset = 7; 2922 break; 2923 case AArch64::ST2GOffset: 2924 case AArch64::STZ2GOffset: 2925 Scale = TypeSize::Fixed(16); 2926 Width = 32; 2927 MinOffset = -256; 2928 MaxOffset = 255; 2929 break; 2930 case AArch64::STGPi: 2931 Scale = TypeSize::Fixed(16); 2932 Width = 16; 2933 MinOffset = -64; 2934 MaxOffset = 63; 2935 break; 2936 case AArch64::LD1RB_IMM: 2937 case AArch64::LD1RB_H_IMM: 2938 case AArch64::LD1RB_S_IMM: 2939 case AArch64::LD1RB_D_IMM: 2940 case AArch64::LD1RSB_H_IMM: 2941 case AArch64::LD1RSB_S_IMM: 2942 case AArch64::LD1RSB_D_IMM: 2943 Scale = TypeSize::Fixed(1); 2944 Width = 1; 2945 MinOffset = 0; 2946 MaxOffset = 63; 2947 break; 2948 case AArch64::LD1RH_IMM: 2949 case AArch64::LD1RH_S_IMM: 2950 case AArch64::LD1RH_D_IMM: 2951 case AArch64::LD1RSH_S_IMM: 2952 
case AArch64::LD1RSH_D_IMM: 2953 Scale = TypeSize::Fixed(2); 2954 Width = 2; 2955 MinOffset = 0; 2956 MaxOffset = 63; 2957 break; 2958 case AArch64::LD1RW_IMM: 2959 case AArch64::LD1RW_D_IMM: 2960 case AArch64::LD1RSW_IMM: 2961 Scale = TypeSize::Fixed(4); 2962 Width = 4; 2963 MinOffset = 0; 2964 MaxOffset = 63; 2965 break; 2966 case AArch64::LD1RD_IMM: 2967 Scale = TypeSize::Fixed(8); 2968 Width = 8; 2969 MinOffset = 0; 2970 MaxOffset = 63; 2971 break; 2972 } 2973 2974 return true; 2975 } 2976 2977 // Scaling factor for unscaled load or store. 2978 int AArch64InstrInfo::getMemScale(unsigned Opc) { 2979 switch (Opc) { 2980 default: 2981 llvm_unreachable("Opcode has unknown scale!"); 2982 case AArch64::LDRBBui: 2983 case AArch64::LDURBBi: 2984 case AArch64::LDRSBWui: 2985 case AArch64::LDURSBWi: 2986 case AArch64::STRBBui: 2987 case AArch64::STURBBi: 2988 return 1; 2989 case AArch64::LDRHHui: 2990 case AArch64::LDURHHi: 2991 case AArch64::LDRSHWui: 2992 case AArch64::LDURSHWi: 2993 case AArch64::STRHHui: 2994 case AArch64::STURHHi: 2995 return 2; 2996 case AArch64::LDRSui: 2997 case AArch64::LDURSi: 2998 case AArch64::LDRSpre: 2999 case AArch64::LDRSWui: 3000 case AArch64::LDURSWi: 3001 case AArch64::LDRWpre: 3002 case AArch64::LDRWui: 3003 case AArch64::LDURWi: 3004 case AArch64::STRSui: 3005 case AArch64::STURSi: 3006 case AArch64::STRSpre: 3007 case AArch64::STRWui: 3008 case AArch64::STURWi: 3009 case AArch64::STRWpre: 3010 case AArch64::LDPSi: 3011 case AArch64::LDPSWi: 3012 case AArch64::LDPWi: 3013 case AArch64::STPSi: 3014 case AArch64::STPWi: 3015 return 4; 3016 case AArch64::LDRDui: 3017 case AArch64::LDURDi: 3018 case AArch64::LDRDpre: 3019 case AArch64::LDRXui: 3020 case AArch64::LDURXi: 3021 case AArch64::LDRXpre: 3022 case AArch64::STRDui: 3023 case AArch64::STURDi: 3024 case AArch64::STRDpre: 3025 case AArch64::STRXui: 3026 case AArch64::STURXi: 3027 case AArch64::STRXpre: 3028 case AArch64::LDPDi: 3029 case AArch64::LDPXi: 3030 case AArch64::STPDi: 3031 case AArch64::STPXi: 3032 return 8; 3033 case AArch64::LDRQui: 3034 case AArch64::LDURQi: 3035 case AArch64::STRQui: 3036 case AArch64::STURQi: 3037 case AArch64::STRQpre: 3038 case AArch64::LDPQi: 3039 case AArch64::LDRQpre: 3040 case AArch64::STPQi: 3041 case AArch64::STGOffset: 3042 case AArch64::STZGOffset: 3043 case AArch64::ST2GOffset: 3044 case AArch64::STZ2GOffset: 3045 case AArch64::STGPi: 3046 return 16; 3047 } 3048 } 3049 3050 bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) { 3051 switch (MI.getOpcode()) { 3052 default: 3053 return false; 3054 case AArch64::LDRWpre: 3055 case AArch64::LDRXpre: 3056 case AArch64::LDRSpre: 3057 case AArch64::LDRDpre: 3058 case AArch64::LDRQpre: 3059 return true; 3060 } 3061 } 3062 3063 bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) { 3064 switch (MI.getOpcode()) { 3065 default: 3066 return false; 3067 case AArch64::STRWpre: 3068 case AArch64::STRXpre: 3069 case AArch64::STRSpre: 3070 case AArch64::STRDpre: 3071 case AArch64::STRQpre: 3072 return true; 3073 } 3074 } 3075 3076 bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) { 3077 return isPreLd(MI) || isPreSt(MI); 3078 } 3079 3080 // Scale the unscaled offsets. Returns false if the unscaled offset can't be 3081 // scaled. 3082 static bool scaleOffset(unsigned Opc, int64_t &Offset) { 3083 int Scale = AArch64InstrInfo::getMemScale(Opc); 3084 3085 // If the byte-offset isn't a multiple of the stride, we can't scale this 3086 // offset. 
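// For example (illustrative values): with an 8-byte stride, a byte offset of
// 12 cannot be scaled, while a byte offset of 16 becomes element offset 2.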
3087 if (Offset % Scale != 0) 3088 return false; 3089 3090 // Convert the byte-offset used by unscaled into an "element" offset used 3091 // by the scaled pair load/store instructions. 3092 Offset /= Scale; 3093 return true; 3094 } 3095 3096 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) { 3097 if (FirstOpc == SecondOpc) 3098 return true; 3099 // We can also pair sign-ext and zero-ext instructions. 3100 switch (FirstOpc) { 3101 default: 3102 return false; 3103 case AArch64::LDRWui: 3104 case AArch64::LDURWi: 3105 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi; 3106 case AArch64::LDRSWui: 3107 case AArch64::LDURSWi: 3108 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi; 3109 } 3110 // These instructions can't be paired based on their opcodes. 3111 return false; 3112 } 3113 3114 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, 3115 int64_t Offset1, unsigned Opcode1, int FI2, 3116 int64_t Offset2, unsigned Opcode2) { 3117 // Accesses through fixed stack object frame indices may access a different 3118 // fixed stack slot. Check that the object offsets + offsets match. 3119 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) { 3120 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1); 3121 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2); 3122 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered."); 3123 // Convert to scaled object offsets. 3124 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1); 3125 if (ObjectOffset1 % Scale1 != 0) 3126 return false; 3127 ObjectOffset1 /= Scale1; 3128 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2); 3129 if (ObjectOffset2 % Scale2 != 0) 3130 return false; 3131 ObjectOffset2 /= Scale2; 3132 ObjectOffset1 += Offset1; 3133 ObjectOffset2 += Offset2; 3134 return ObjectOffset1 + 1 == ObjectOffset2; 3135 } 3136 3137 return FI1 == FI2; 3138 } 3139 3140 /// Detect opportunities for ldp/stp formation. 3141 /// 3142 /// Only called for LdSt for which getMemOperandWithOffset returns true. 3143 bool AArch64InstrInfo::shouldClusterMemOps( 3144 ArrayRef<const MachineOperand *> BaseOps1, 3145 ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads, 3146 unsigned NumBytes) const { 3147 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1); 3148 const MachineOperand &BaseOp1 = *BaseOps1.front(); 3149 const MachineOperand &BaseOp2 = *BaseOps2.front(); 3150 const MachineInstr &FirstLdSt = *BaseOp1.getParent(); 3151 const MachineInstr &SecondLdSt = *BaseOp2.getParent(); 3152 if (BaseOp1.getType() != BaseOp2.getType()) 3153 return false; 3154 3155 assert((BaseOp1.isReg() || BaseOp1.isFI()) && 3156 "Only base registers and frame indices are supported."); 3157 3158 // Check for both base regs and base FI. 3159 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg()) 3160 return false; 3161 3162 // Only cluster up to a single pair. 3163 if (NumLoads > 2) 3164 return false; 3165 3166 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt)) 3167 return false; 3168 3169 // Can we pair these instructions based on their opcodes? 3170 unsigned FirstOpc = FirstLdSt.getOpcode(); 3171 unsigned SecondOpc = SecondLdSt.getOpcode(); 3172 if (!canPairLdStOpc(FirstOpc, SecondOpc)) 3173 return false; 3174 3175 // Can't merge volatiles or load/stores that have a hint to avoid pair 3176 // formation, for example. 
3177 if (!isCandidateToMergeOrPair(FirstLdSt) || 3178 !isCandidateToMergeOrPair(SecondLdSt)) 3179 return false; 3180 3181 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate. 3182 int64_t Offset1 = FirstLdSt.getOperand(2).getImm(); 3183 if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1)) 3184 return false; 3185 3186 int64_t Offset2 = SecondLdSt.getOperand(2).getImm(); 3187 if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2)) 3188 return false; 3189 3190 // Pairwise instructions have a 7-bit signed offset field. 3191 if (Offset1 > 63 || Offset1 < -64) 3192 return false; 3193 3194 // The caller should already have ordered First/SecondLdSt by offset. 3195 // Note: except for non-equal frame index bases 3196 if (BaseOp1.isFI()) { 3197 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) && 3198 "Caller should have ordered offsets."); 3199 3200 const MachineFrameInfo &MFI = 3201 FirstLdSt.getParent()->getParent()->getFrameInfo(); 3202 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc, 3203 BaseOp2.getIndex(), Offset2, SecondOpc); 3204 } 3205 3206 assert(Offset1 <= Offset2 && "Caller should have ordered offsets."); 3207 3208 return Offset1 + 1 == Offset2; 3209 } 3210 3211 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, 3212 unsigned Reg, unsigned SubIdx, 3213 unsigned State, 3214 const TargetRegisterInfo *TRI) { 3215 if (!SubIdx) 3216 return MIB.addReg(Reg, State); 3217 3218 if (Register::isPhysicalRegister(Reg)) 3219 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); 3220 return MIB.addReg(Reg, State, SubIdx); 3221 } 3222 3223 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, 3224 unsigned NumRegs) { 3225 // We really want the positive remainder mod 32 here, that happens to be 3226 // easily obtainable with a mask. 
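// For example (illustrative encodings): copying a 4-register tuple from
// encodings {30,31,0,1} to {31,0,1,2} gives (31 - 30) & 0x1f == 1 < 4, so a
// forward sub-register copy would overwrite a not-yet-read source register
// and the caller must copy the tuple backwards instead.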
3227 return ((DestReg - SrcReg) & 0x1f) < NumRegs; 3228 } 3229 3230 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB, 3231 MachineBasicBlock::iterator I, 3232 const DebugLoc &DL, MCRegister DestReg, 3233 MCRegister SrcReg, bool KillSrc, 3234 unsigned Opcode, 3235 ArrayRef<unsigned> Indices) const { 3236 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON"); 3237 const TargetRegisterInfo *TRI = &getRegisterInfo(); 3238 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 3239 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 3240 unsigned NumRegs = Indices.size(); 3241 3242 int SubReg = 0, End = NumRegs, Incr = 1; 3243 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) { 3244 SubReg = NumRegs - 1; 3245 End = -1; 3246 Incr = -1; 3247 } 3248 3249 for (; SubReg != End; SubReg += Incr) { 3250 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 3251 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 3252 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); 3253 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 3254 } 3255 } 3256 3257 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB, 3258 MachineBasicBlock::iterator I, 3259 DebugLoc DL, unsigned DestReg, 3260 unsigned SrcReg, bool KillSrc, 3261 unsigned Opcode, unsigned ZeroReg, 3262 llvm::ArrayRef<unsigned> Indices) const { 3263 const TargetRegisterInfo *TRI = &getRegisterInfo(); 3264 unsigned NumRegs = Indices.size(); 3265 3266 #ifndef NDEBUG 3267 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 3268 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 3269 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 && 3270 "GPR reg sequences should not be able to overlap"); 3271 #endif 3272 3273 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) { 3274 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 3275 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 3276 MIB.addReg(ZeroReg); 3277 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 3278 MIB.addImm(0); 3279 } 3280 } 3281 3282 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, 3283 MachineBasicBlock::iterator I, 3284 const DebugLoc &DL, MCRegister DestReg, 3285 MCRegister SrcReg, bool KillSrc) const { 3286 if (AArch64::GPR32spRegClass.contains(DestReg) && 3287 (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { 3288 const TargetRegisterInfo *TRI = &getRegisterInfo(); 3289 3290 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) { 3291 // If either operand is WSP, expand to ADD #0. 3292 if (Subtarget.hasZeroCycleRegMove()) { 3293 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. 3294 MCRegister DestRegX = TRI->getMatchingSuperReg( 3295 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 3296 MCRegister SrcRegX = TRI->getMatchingSuperReg( 3297 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 3298 // This instruction is reading and writing X registers. This may upset 3299 // the register scavenger and machine verifier, so we need to indicate 3300 // that we are reading an undefined value from SrcRegX, but a proper 3301 // value from SrcReg. 
3302 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX) 3303 .addReg(SrcRegX, RegState::Undef) 3304 .addImm(0) 3305 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) 3306 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 3307 } else { 3308 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg) 3309 .addReg(SrcReg, getKillRegState(KillSrc)) 3310 .addImm(0) 3311 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3312 } 3313 } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) { 3314 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg) 3315 .addImm(0) 3316 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3317 } else { 3318 if (Subtarget.hasZeroCycleRegMove()) { 3319 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. 3320 MCRegister DestRegX = TRI->getMatchingSuperReg( 3321 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 3322 MCRegister SrcRegX = TRI->getMatchingSuperReg( 3323 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 3324 // This instruction is reading and writing X registers. This may upset 3325 // the register scavenger and machine verifier, so we need to indicate 3326 // that we are reading an undefined value from SrcRegX, but a proper 3327 // value from SrcReg. 3328 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX) 3329 .addReg(AArch64::XZR) 3330 .addReg(SrcRegX, RegState::Undef) 3331 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 3332 } else { 3333 // Otherwise, expand to ORR WZR. 3334 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg) 3335 .addReg(AArch64::WZR) 3336 .addReg(SrcReg, getKillRegState(KillSrc)); 3337 } 3338 } 3339 return; 3340 } 3341 3342 // Copy a Predicate register by ORRing with itself. 3343 if (AArch64::PPRRegClass.contains(DestReg) && 3344 AArch64::PPRRegClass.contains(SrcReg)) { 3345 assert(Subtarget.hasSVE() && "Unexpected SVE register."); 3346 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg) 3347 .addReg(SrcReg) // Pg 3348 .addReg(SrcReg) 3349 .addReg(SrcReg, getKillRegState(KillSrc)); 3350 return; 3351 } 3352 3353 // Copy a Z register by ORRing with itself. 3354 if (AArch64::ZPRRegClass.contains(DestReg) && 3355 AArch64::ZPRRegClass.contains(SrcReg)) { 3356 assert(Subtarget.hasSVE() && "Unexpected SVE register."); 3357 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg) 3358 .addReg(SrcReg) 3359 .addReg(SrcReg, getKillRegState(KillSrc)); 3360 return; 3361 } 3362 3363 // Copy a Z register pair by copying the individual sub-registers. 3364 if (AArch64::ZPR2RegClass.contains(DestReg) && 3365 AArch64::ZPR2RegClass.contains(SrcReg)) { 3366 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1}; 3367 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 3368 Indices); 3369 return; 3370 } 3371 3372 // Copy a Z register triple by copying the individual sub-registers. 3373 if (AArch64::ZPR3RegClass.contains(DestReg) && 3374 AArch64::ZPR3RegClass.contains(SrcReg)) { 3375 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, 3376 AArch64::zsub2}; 3377 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 3378 Indices); 3379 return; 3380 } 3381 3382 // Copy a Z register quad by copying the individual sub-registers. 
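// (Illustratively, a ZPR4 copy z4..z7 <- z0..z3 expands to four unpredicated
// "orr zN.d, zM.d, zM.d" moves, one per zsub0..zsub3 sub-register.)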
3383 if (AArch64::ZPR4RegClass.contains(DestReg) && 3384 AArch64::ZPR4RegClass.contains(SrcReg)) { 3385 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, 3386 AArch64::zsub2, AArch64::zsub3}; 3387 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 3388 Indices); 3389 return; 3390 } 3391 3392 if (AArch64::GPR64spRegClass.contains(DestReg) && 3393 (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { 3394 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { 3395 // If either operand is SP, expand to ADD #0. 3396 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg) 3397 .addReg(SrcReg, getKillRegState(KillSrc)) 3398 .addImm(0) 3399 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3400 } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) { 3401 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg) 3402 .addImm(0) 3403 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3404 } else { 3405 // Otherwise, expand to ORR XZR. 3406 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg) 3407 .addReg(AArch64::XZR) 3408 .addReg(SrcReg, getKillRegState(KillSrc)); 3409 } 3410 return; 3411 } 3412 3413 // Copy a DDDD register quad by copying the individual sub-registers. 3414 if (AArch64::DDDDRegClass.contains(DestReg) && 3415 AArch64::DDDDRegClass.contains(SrcReg)) { 3416 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 3417 AArch64::dsub2, AArch64::dsub3}; 3418 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 3419 Indices); 3420 return; 3421 } 3422 3423 // Copy a DDD register triple by copying the individual sub-registers. 3424 if (AArch64::DDDRegClass.contains(DestReg) && 3425 AArch64::DDDRegClass.contains(SrcReg)) { 3426 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 3427 AArch64::dsub2}; 3428 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 3429 Indices); 3430 return; 3431 } 3432 3433 // Copy a DD register pair by copying the individual sub-registers. 3434 if (AArch64::DDRegClass.contains(DestReg) && 3435 AArch64::DDRegClass.contains(SrcReg)) { 3436 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1}; 3437 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 3438 Indices); 3439 return; 3440 } 3441 3442 // Copy a QQQQ register quad by copying the individual sub-registers. 3443 if (AArch64::QQQQRegClass.contains(DestReg) && 3444 AArch64::QQQQRegClass.contains(SrcReg)) { 3445 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 3446 AArch64::qsub2, AArch64::qsub3}; 3447 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 3448 Indices); 3449 return; 3450 } 3451 3452 // Copy a QQQ register triple by copying the individual sub-registers. 3453 if (AArch64::QQQRegClass.contains(DestReg) && 3454 AArch64::QQQRegClass.contains(SrcReg)) { 3455 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 3456 AArch64::qsub2}; 3457 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 3458 Indices); 3459 return; 3460 } 3461 3462 // Copy a QQ register pair by copying the individual sub-registers. 
3463 if (AArch64::QQRegClass.contains(DestReg) && 3464 AArch64::QQRegClass.contains(SrcReg)) { 3465 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1}; 3466 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 3467 Indices); 3468 return; 3469 } 3470 3471 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) && 3472 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) { 3473 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64}; 3474 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs, 3475 AArch64::XZR, Indices); 3476 return; 3477 } 3478 3479 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) && 3480 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) { 3481 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32}; 3482 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs, 3483 AArch64::WZR, Indices); 3484 return; 3485 } 3486 3487 if (AArch64::FPR128RegClass.contains(DestReg) && 3488 AArch64::FPR128RegClass.contains(SrcReg)) { 3489 if (Subtarget.hasNEON()) { 3490 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 3491 .addReg(SrcReg) 3492 .addReg(SrcReg, getKillRegState(KillSrc)); 3493 } else { 3494 BuildMI(MBB, I, DL, get(AArch64::STRQpre)) 3495 .addReg(AArch64::SP, RegState::Define) 3496 .addReg(SrcReg, getKillRegState(KillSrc)) 3497 .addReg(AArch64::SP) 3498 .addImm(-16); 3499 BuildMI(MBB, I, DL, get(AArch64::LDRQpre)) 3500 .addReg(AArch64::SP, RegState::Define) 3501 .addReg(DestReg, RegState::Define) 3502 .addReg(AArch64::SP) 3503 .addImm(16); 3504 } 3505 return; 3506 } 3507 3508 if (AArch64::FPR64RegClass.contains(DestReg) && 3509 AArch64::FPR64RegClass.contains(SrcReg)) { 3510 if (Subtarget.hasNEON()) { 3511 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub, 3512 &AArch64::FPR128RegClass); 3513 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub, 3514 &AArch64::FPR128RegClass); 3515 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 3516 .addReg(SrcReg) 3517 .addReg(SrcReg, getKillRegState(KillSrc)); 3518 } else { 3519 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg) 3520 .addReg(SrcReg, getKillRegState(KillSrc)); 3521 } 3522 return; 3523 } 3524 3525 if (AArch64::FPR32RegClass.contains(DestReg) && 3526 AArch64::FPR32RegClass.contains(SrcReg)) { 3527 if (Subtarget.hasNEON()) { 3528 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub, 3529 &AArch64::FPR128RegClass); 3530 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub, 3531 &AArch64::FPR128RegClass); 3532 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 3533 .addReg(SrcReg) 3534 .addReg(SrcReg, getKillRegState(KillSrc)); 3535 } else { 3536 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 3537 .addReg(SrcReg, getKillRegState(KillSrc)); 3538 } 3539 return; 3540 } 3541 3542 if (AArch64::FPR16RegClass.contains(DestReg) && 3543 AArch64::FPR16RegClass.contains(SrcReg)) { 3544 if (Subtarget.hasNEON()) { 3545 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub, 3546 &AArch64::FPR128RegClass); 3547 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub, 3548 &AArch64::FPR128RegClass); 3549 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 3550 .addReg(SrcReg) 3551 .addReg(SrcReg, getKillRegState(KillSrc)); 3552 } else { 3553 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub, 3554 &AArch64::FPR32RegClass); 3555 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub, 3556 &AArch64::FPR32RegClass); 3557 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 3558 .addReg(SrcReg, getKillRegState(KillSrc)); 3559 } 3560 
return; 3561 } 3562 3563 if (AArch64::FPR8RegClass.contains(DestReg) && 3564 AArch64::FPR8RegClass.contains(SrcReg)) { 3565 if (Subtarget.hasNEON()) { 3566 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub, 3567 &AArch64::FPR128RegClass); 3568 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, 3569 &AArch64::FPR128RegClass); 3570 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 3571 .addReg(SrcReg) 3572 .addReg(SrcReg, getKillRegState(KillSrc)); 3573 } else { 3574 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub, 3575 &AArch64::FPR32RegClass); 3576 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, 3577 &AArch64::FPR32RegClass); 3578 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 3579 .addReg(SrcReg, getKillRegState(KillSrc)); 3580 } 3581 return; 3582 } 3583 3584 // Copies between GPR64 and FPR64. 3585 if (AArch64::FPR64RegClass.contains(DestReg) && 3586 AArch64::GPR64RegClass.contains(SrcReg)) { 3587 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) 3588 .addReg(SrcReg, getKillRegState(KillSrc)); 3589 return; 3590 } 3591 if (AArch64::GPR64RegClass.contains(DestReg) && 3592 AArch64::FPR64RegClass.contains(SrcReg)) { 3593 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg) 3594 .addReg(SrcReg, getKillRegState(KillSrc)); 3595 return; 3596 } 3597 // Copies between GPR32 and FPR32. 3598 if (AArch64::FPR32RegClass.contains(DestReg) && 3599 AArch64::GPR32RegClass.contains(SrcReg)) { 3600 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) 3601 .addReg(SrcReg, getKillRegState(KillSrc)); 3602 return; 3603 } 3604 if (AArch64::GPR32RegClass.contains(DestReg) && 3605 AArch64::FPR32RegClass.contains(SrcReg)) { 3606 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg) 3607 .addReg(SrcReg, getKillRegState(KillSrc)); 3608 return; 3609 } 3610 3611 if (DestReg == AArch64::NZCV) { 3612 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy"); 3613 BuildMI(MBB, I, DL, get(AArch64::MSR)) 3614 .addImm(AArch64SysReg::NZCV) 3615 .addReg(SrcReg, getKillRegState(KillSrc)) 3616 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define); 3617 return; 3618 } 3619 3620 if (SrcReg == AArch64::NZCV) { 3621 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy"); 3622 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg) 3623 .addImm(AArch64SysReg::NZCV) 3624 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); 3625 return; 3626 } 3627 3628 #ifndef NDEBUG 3629 const TargetRegisterInfo &TRI = getRegisterInfo(); 3630 errs() << TRI.getRegAsmName(DestReg) << " = COPY " 3631 << TRI.getRegAsmName(SrcReg) << "\n"; 3632 #endif 3633 llvm_unreachable("unimplemented reg-to-reg copy"); 3634 } 3635 3636 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, 3637 MachineBasicBlock &MBB, 3638 MachineBasicBlock::iterator InsertBefore, 3639 const MCInstrDesc &MCID, 3640 Register SrcReg, bool IsKill, 3641 unsigned SubIdx0, unsigned SubIdx1, int FI, 3642 MachineMemOperand *MMO) { 3643 Register SrcReg0 = SrcReg; 3644 Register SrcReg1 = SrcReg; 3645 if (Register::isPhysicalRegister(SrcReg)) { 3646 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0); 3647 SubIdx0 = 0; 3648 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1); 3649 SubIdx1 = 0; 3650 } 3651 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 3652 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0) 3653 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1) 3654 .addFrameIndex(FI) 3655 .addImm(0) 3656 .addMemOperand(MMO); 3657 } 3658 3659 void AArch64InstrInfo::storeRegToStackSlot( 3660 MachineBasicBlock &MBB, 
MachineBasicBlock::iterator MBBI, Register SrcReg, 3661 bool isKill, int FI, const TargetRegisterClass *RC, 3662 const TargetRegisterInfo *TRI) const { 3663 MachineFunction &MF = *MBB.getParent(); 3664 MachineFrameInfo &MFI = MF.getFrameInfo(); 3665 3666 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 3667 MachineMemOperand *MMO = 3668 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, 3669 MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); 3670 unsigned Opc = 0; 3671 bool Offset = true; 3672 unsigned StackID = TargetStackID::Default; 3673 switch (TRI->getSpillSize(*RC)) { 3674 case 1: 3675 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 3676 Opc = AArch64::STRBui; 3677 break; 3678 case 2: 3679 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 3680 Opc = AArch64::STRHui; 3681 else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { 3682 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3683 Opc = AArch64::STR_PXI; 3684 StackID = TargetStackID::ScalableVector; 3685 } 3686 break; 3687 case 4: 3688 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 3689 Opc = AArch64::STRWui; 3690 if (Register::isVirtualRegister(SrcReg)) 3691 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass); 3692 else 3693 assert(SrcReg != AArch64::WSP); 3694 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 3695 Opc = AArch64::STRSui; 3696 break; 3697 case 8: 3698 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 3699 Opc = AArch64::STRXui; 3700 if (Register::isVirtualRegister(SrcReg)) 3701 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 3702 else 3703 assert(SrcReg != AArch64::SP); 3704 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 3705 Opc = AArch64::STRDui; 3706 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 3707 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 3708 get(AArch64::STPWi), SrcReg, isKill, 3709 AArch64::sube32, AArch64::subo32, FI, MMO); 3710 return; 3711 } 3712 break; 3713 case 16: 3714 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 3715 Opc = AArch64::STRQui; 3716 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 3717 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3718 Opc = AArch64::ST1Twov1d; 3719 Offset = false; 3720 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 3721 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 3722 get(AArch64::STPXi), SrcReg, isKill, 3723 AArch64::sube64, AArch64::subo64, FI, MMO); 3724 return; 3725 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 3726 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3727 Opc = AArch64::STR_ZXI; 3728 StackID = TargetStackID::ScalableVector; 3729 } 3730 break; 3731 case 24: 3732 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 3733 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3734 Opc = AArch64::ST1Threev1d; 3735 Offset = false; 3736 } 3737 break; 3738 case 32: 3739 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 3740 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3741 Opc = AArch64::ST1Fourv1d; 3742 Offset = false; 3743 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 3744 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3745 Opc = AArch64::ST1Twov2d; 3746 Offset = false; 3747 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { 3748 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3749 Opc = AArch64::STR_ZZXI; 3750 StackID = TargetStackID::ScalableVector; 3751 } 
3752 break; 3753 case 48: 3754 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 3755 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3756 Opc = AArch64::ST1Threev2d; 3757 Offset = false; 3758 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { 3759 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3760 Opc = AArch64::STR_ZZZXI; 3761 StackID = TargetStackID::ScalableVector; 3762 } 3763 break; 3764 case 64: 3765 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 3766 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3767 Opc = AArch64::ST1Fourv2d; 3768 Offset = false; 3769 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { 3770 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3771 Opc = AArch64::STR_ZZZZXI; 3772 StackID = TargetStackID::ScalableVector; 3773 } 3774 break; 3775 } 3776 assert(Opc && "Unknown register class"); 3777 MFI.setStackID(FI, StackID); 3778 3779 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 3780 .addReg(SrcReg, getKillRegState(isKill)) 3781 .addFrameIndex(FI); 3782 3783 if (Offset) 3784 MI.addImm(0); 3785 MI.addMemOperand(MMO); 3786 } 3787 3788 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, 3789 MachineBasicBlock &MBB, 3790 MachineBasicBlock::iterator InsertBefore, 3791 const MCInstrDesc &MCID, 3792 Register DestReg, unsigned SubIdx0, 3793 unsigned SubIdx1, int FI, 3794 MachineMemOperand *MMO) { 3795 Register DestReg0 = DestReg; 3796 Register DestReg1 = DestReg; 3797 bool IsUndef = true; 3798 if (Register::isPhysicalRegister(DestReg)) { 3799 DestReg0 = TRI.getSubReg(DestReg, SubIdx0); 3800 SubIdx0 = 0; 3801 DestReg1 = TRI.getSubReg(DestReg, SubIdx1); 3802 SubIdx1 = 0; 3803 IsUndef = false; 3804 } 3805 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 3806 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0) 3807 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1) 3808 .addFrameIndex(FI) 3809 .addImm(0) 3810 .addMemOperand(MMO); 3811 } 3812 3813 void AArch64InstrInfo::loadRegFromStackSlot( 3814 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, 3815 int FI, const TargetRegisterClass *RC, 3816 const TargetRegisterInfo *TRI) const { 3817 MachineFunction &MF = *MBB.getParent(); 3818 MachineFrameInfo &MFI = MF.getFrameInfo(); 3819 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 3820 MachineMemOperand *MMO = 3821 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, 3822 MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); 3823 3824 unsigned Opc = 0; 3825 bool Offset = true; 3826 unsigned StackID = TargetStackID::Default; 3827 switch (TRI->getSpillSize(*RC)) { 3828 case 1: 3829 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 3830 Opc = AArch64::LDRBui; 3831 break; 3832 case 2: 3833 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 3834 Opc = AArch64::LDRHui; 3835 else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { 3836 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 3837 Opc = AArch64::LDR_PXI; 3838 StackID = TargetStackID::ScalableVector; 3839 } 3840 break; 3841 case 4: 3842 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 3843 Opc = AArch64::LDRWui; 3844 if (Register::isVirtualRegister(DestReg)) 3845 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass); 3846 else 3847 assert(DestReg != AArch64::WSP); 3848 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 3849 Opc = AArch64::LDRSui; 3850 break; 3851 case 8: 3852 if 
(AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 3853 Opc = AArch64::LDRXui; 3854 if (Register::isVirtualRegister(DestReg)) 3855 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass); 3856 else 3857 assert(DestReg != AArch64::SP); 3858 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 3859 Opc = AArch64::LDRDui; 3860 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 3861 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 3862 get(AArch64::LDPWi), DestReg, AArch64::sube32, 3863 AArch64::subo32, FI, MMO); 3864 return; 3865 } 3866 break; 3867 case 16: 3868 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 3869 Opc = AArch64::LDRQui; 3870 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 3871 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3872 Opc = AArch64::LD1Twov1d; 3873 Offset = false; 3874 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 3875 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 3876 get(AArch64::LDPXi), DestReg, AArch64::sube64, 3877 AArch64::subo64, FI, MMO); 3878 return; 3879 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 3880 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 3881 Opc = AArch64::LDR_ZXI; 3882 StackID = TargetStackID::ScalableVector; 3883 } 3884 break; 3885 case 24: 3886 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 3887 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3888 Opc = AArch64::LD1Threev1d; 3889 Offset = false; 3890 } 3891 break; 3892 case 32: 3893 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 3894 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3895 Opc = AArch64::LD1Fourv1d; 3896 Offset = false; 3897 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 3898 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3899 Opc = AArch64::LD1Twov2d; 3900 Offset = false; 3901 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { 3902 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 3903 Opc = AArch64::LDR_ZZXI; 3904 StackID = TargetStackID::ScalableVector; 3905 } 3906 break; 3907 case 48: 3908 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 3909 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3910 Opc = AArch64::LD1Threev2d; 3911 Offset = false; 3912 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { 3913 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 3914 Opc = AArch64::LDR_ZZZXI; 3915 StackID = TargetStackID::ScalableVector; 3916 } 3917 break; 3918 case 64: 3919 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 3920 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3921 Opc = AArch64::LD1Fourv2d; 3922 Offset = false; 3923 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { 3924 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 3925 Opc = AArch64::LDR_ZZZZXI; 3926 StackID = TargetStackID::ScalableVector; 3927 } 3928 break; 3929 } 3930 3931 assert(Opc && "Unknown register class"); 3932 MFI.setStackID(FI, StackID); 3933 3934 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 3935 .addReg(DestReg, getDefRegState(true)) 3936 .addFrameIndex(FI); 3937 if (Offset) 3938 MI.addImm(0); 3939 MI.addMemOperand(MMO); 3940 } 3941 3942 bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, 3943 const MachineInstr &UseMI, 3944 const TargetRegisterInfo *TRI) { 3945 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()), 3946 UseMI.getIterator()), 
3947 [TRI](const MachineInstr &I) {
3948 return I.modifiesRegister(AArch64::NZCV, TRI) ||
3949 I.readsRegister(AArch64::NZCV, TRI);
3950 });
3951 }
3952
3953 void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
3954 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
3955 // The smallest scalable element supported by scaled SVE addressing
3956 // modes is a predicate, which is 2 scalable bytes in size. So the scalable
3957 // byte offset must always be a multiple of 2.
3958 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
3959
3960 // VGSized offsets are divided by '2', because the VG register is the
3961 // number of 64bit granules as opposed to 128bit vector chunks,
3962 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled.
3963 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
3964 // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
3965 ByteSized = Offset.getFixed();
3966 VGSized = Offset.getScalable() / 2;
3967 }
3968
3969 /// Returns, via its output parameters, the parts into which this frame
3970 /// offset can be decomposed for the purpose of describing a frame offset.
3971 /// For non-scalable offsets this is simply its byte size.
3972 void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
3973 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
3974 int64_t &NumDataVectors) {
3975 // The smallest scalable element supported by scaled SVE addressing
3976 // modes is a predicate, which is 2 scalable bytes in size. So the scalable
3977 // byte offset must always be a multiple of 2.
3978 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
3979
3980 NumBytes = Offset.getFixed();
3981 NumDataVectors = 0;
3982 NumPredicateVectors = Offset.getScalable() / 2;
3983 // This method is used to get the offsets to adjust the frame offset.
3984 // If the function requires ADDPL to be used and needs more than two ADDPL
3985 // instructions, part of the offset is folded into NumDataVectors so that it
3986 // uses ADDVL for part of it, reducing the number of ADDPL instructions.
3987 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
3988 NumPredicateVectors > 62) {
3989 NumDataVectors = NumPredicateVectors / 8;
3990 NumPredicateVectors -= NumDataVectors * 8;
3991 }
3992 }
3993
3994 // Helper function to emit a frame offset adjustment from a given
3995 // pointer (SrcReg), writing the result to DestReg. The function is explicit
3996 // in that the caller must supply the opcode to use.
3997 static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
3998 MachineBasicBlock::iterator MBBI,
3999 const DebugLoc &DL, unsigned DestReg,
4000 unsigned SrcReg, int64_t Offset, unsigned Opc,
4001 const TargetInstrInfo *TII,
4002 MachineInstr::MIFlag Flag, bool NeedsWinCFI,
4003 bool *HasWinCFI) {
4004 int Sign = 1;
4005 unsigned MaxEncoding, ShiftSize;
4006 switch (Opc) {
4007 case AArch64::ADDXri:
4008 case AArch64::ADDSXri:
4009 case AArch64::SUBXri:
4010 case AArch64::SUBSXri:
4011 MaxEncoding = 0xfff;
4012 ShiftSize = 12;
4013 break;
4014 case AArch64::ADDVL_XXI:
4015 case AArch64::ADDPL_XXI:
4016 MaxEncoding = 31;
4017 ShiftSize = 0;
4018 if (Offset < 0) {
4019 MaxEncoding = 32;
4020 Sign = -1;
4021 Offset = -Offset;
4022 }
4023 break;
4024 default:
4025 llvm_unreachable("Unsupported opcode");
4026 }
4027
4028 // FIXME: If the offset won't fit in 24-bits, compute the offset into a
4029 // scratch register.
If DestReg is a virtual register, use it as the 4030 // scratch register; otherwise, create a new virtual register (to be 4031 // replaced by the scavenger at the end of PEI). That case can be optimized 4032 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch 4033 // register can be loaded with offset%8 and the add/sub can use an extending 4034 // instruction with LSL#3. 4035 // Currently the function handles any offsets but generates a poor sequence 4036 // of code. 4037 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); 4038 4039 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; 4040 Register TmpReg = DestReg; 4041 if (TmpReg == AArch64::XZR) 4042 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister( 4043 &AArch64::GPR64RegClass); 4044 do { 4045 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue); 4046 unsigned LocalShiftSize = 0; 4047 if (ThisVal > MaxEncoding) { 4048 ThisVal = ThisVal >> ShiftSize; 4049 LocalShiftSize = ShiftSize; 4050 } 4051 assert((ThisVal >> ShiftSize) <= MaxEncoding && 4052 "Encoding cannot handle value that big"); 4053 4054 Offset -= ThisVal << LocalShiftSize; 4055 if (Offset == 0) 4056 TmpReg = DestReg; 4057 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg) 4058 .addReg(SrcReg) 4059 .addImm(Sign * (int)ThisVal); 4060 if (ShiftSize) 4061 MBI = MBI.addImm( 4062 AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize)); 4063 MBI = MBI.setMIFlag(Flag); 4064 4065 if (NeedsWinCFI) { 4066 assert(Sign == 1 && "SEH directives should always have a positive sign"); 4067 int Imm = (int)(ThisVal << LocalShiftSize); 4068 if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) || 4069 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) { 4070 if (HasWinCFI) 4071 *HasWinCFI = true; 4072 if (Imm == 0) 4073 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag); 4074 else 4075 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)) 4076 .addImm(Imm) 4077 .setMIFlag(Flag); 4078 assert(Offset == 0 && "Expected remaining offset to be zero to " 4079 "emit a single SEH directive"); 4080 } else if (DestReg == AArch64::SP) { 4081 if (HasWinCFI) 4082 *HasWinCFI = true; 4083 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc"); 4084 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) 4085 .addImm(Imm) 4086 .setMIFlag(Flag); 4087 } 4088 if (HasWinCFI) 4089 *HasWinCFI = true; 4090 } 4091 4092 SrcReg = TmpReg; 4093 } while (Offset); 4094 } 4095 4096 void llvm::emitFrameOffset(MachineBasicBlock &MBB, 4097 MachineBasicBlock::iterator MBBI, const DebugLoc &DL, 4098 unsigned DestReg, unsigned SrcReg, 4099 StackOffset Offset, const TargetInstrInfo *TII, 4100 MachineInstr::MIFlag Flag, bool SetNZCV, 4101 bool NeedsWinCFI, bool *HasWinCFI) { 4102 int64_t Bytes, NumPredicateVectors, NumDataVectors; 4103 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( 4104 Offset, Bytes, NumPredicateVectors, NumDataVectors); 4105 4106 // First emit non-scalable frame offsets, or a simple 'mov'. 4107 if (Bytes || (!Offset && SrcReg != DestReg)) { 4108 assert((DestReg != AArch64::SP || Bytes % 8 == 0) && 4109 "SP increment/decrement not 8-byte aligned"); 4110 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri; 4111 if (Bytes < 0) { 4112 Bytes = -Bytes; 4113 Opc = SetNZCV ? 
AArch64::SUBSXri : AArch64::SUBXri; 4114 } 4115 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag, 4116 NeedsWinCFI, HasWinCFI); 4117 SrcReg = DestReg; 4118 } 4119 4120 assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) && 4121 "SetNZCV not supported with SVE vectors"); 4122 assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) && 4123 "WinCFI not supported with SVE vectors"); 4124 4125 if (NumDataVectors) { 4126 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors, 4127 AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr); 4128 SrcReg = DestReg; 4129 } 4130 4131 if (NumPredicateVectors) { 4132 assert(DestReg != AArch64::SP && "Unaligned access to SP"); 4133 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors, 4134 AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr); 4135 } 4136 } 4137 4138 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( 4139 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, 4140 MachineBasicBlock::iterator InsertPt, int FrameIndex, 4141 LiveIntervals *LIS, VirtRegMap *VRM) const { 4142 // This is a bit of a hack. Consider this instruction: 4143 // 4144 // %0 = COPY %sp; GPR64all:%0 4145 // 4146 // We explicitly chose GPR64all for the virtual register so such a copy might 4147 // be eliminated by RegisterCoalescer. However, that may not be possible, and 4148 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all 4149 // register class, TargetInstrInfo::foldMemoryOperand() is going to try. 4150 // 4151 // To prevent that, we are going to constrain the %0 register class here. 4152 // 4153 // <rdar://problem/11522048> 4154 // 4155 if (MI.isFullCopy()) { 4156 Register DstReg = MI.getOperand(0).getReg(); 4157 Register SrcReg = MI.getOperand(1).getReg(); 4158 if (SrcReg == AArch64::SP && Register::isVirtualRegister(DstReg)) { 4159 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass); 4160 return nullptr; 4161 } 4162 if (DstReg == AArch64::SP && Register::isVirtualRegister(SrcReg)) { 4163 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 4164 return nullptr; 4165 } 4166 } 4167 4168 // Handle the case where a copy is being spilled or filled but the source 4169 // and destination register class don't match. For example: 4170 // 4171 // %0 = COPY %xzr; GPR64common:%0 4172 // 4173 // In this case we can still safely fold away the COPY and generate the 4174 // following spill code: 4175 // 4176 // STRXui %xzr, %stack.0 4177 // 4178 // This also eliminates spilled cross register class COPYs (e.g. between x and 4179 // d regs) of the same size. For example: 4180 // 4181 // %0 = COPY %1; GPR64:%0, FPR64:%1 4182 // 4183 // will be filled as 4184 // 4185 // LDRDui %0, fi<#0> 4186 // 4187 // instead of 4188 // 4189 // LDRXui %Temp, fi<#0> 4190 // %0 = FMOV %Temp 4191 // 4192 if (MI.isCopy() && Ops.size() == 1 && 4193 // Make sure we're only folding the explicit COPY defs/uses. 4194 (Ops[0] == 0 || Ops[0] == 1)) { 4195 bool IsSpill = Ops[0] == 0; 4196 bool IsFill = !IsSpill; 4197 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); 4198 const MachineRegisterInfo &MRI = MF.getRegInfo(); 4199 MachineBasicBlock &MBB = *MI.getParent(); 4200 const MachineOperand &DstMO = MI.getOperand(0); 4201 const MachineOperand &SrcMO = MI.getOperand(1); 4202 Register DstReg = DstMO.getReg(); 4203 Register SrcReg = SrcMO.getReg(); 4204 // This is slightly expensive to compute for physical regs since 4205 // getMinimalPhysRegClass is slow. 
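  // (The helper below picks the register class we spill or fill with: the
  // class recorded in MachineRegisterInfo for a virtual register, or the
  // minimal class containing the physical register otherwise.)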
4206 auto getRegClass = [&](unsigned Reg) { 4207 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg) 4208 : TRI.getMinimalPhysRegClass(Reg); 4209 }; 4210 4211 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { 4212 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) == 4213 TRI.getRegSizeInBits(*getRegClass(SrcReg)) && 4214 "Mismatched register size in non subreg COPY"); 4215 if (IsSpill) 4216 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, 4217 getRegClass(SrcReg), &TRI); 4218 else 4219 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, 4220 getRegClass(DstReg), &TRI); 4221 return &*--InsertPt; 4222 } 4223 4224 // Handle cases like spilling def of: 4225 // 4226 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0 4227 // 4228 // where the physical register source can be widened and stored to the full 4229 // virtual reg destination stack slot, in this case producing: 4230 // 4231 // STRXui %xzr, %stack.0 4232 // 4233 if (IsSpill && DstMO.isUndef() && Register::isPhysicalRegister(SrcReg)) { 4234 assert(SrcMO.getSubReg() == 0 && 4235 "Unexpected subreg on physical register"); 4236 const TargetRegisterClass *SpillRC; 4237 unsigned SpillSubreg; 4238 switch (DstMO.getSubReg()) { 4239 default: 4240 SpillRC = nullptr; 4241 break; 4242 case AArch64::sub_32: 4243 case AArch64::ssub: 4244 if (AArch64::GPR32RegClass.contains(SrcReg)) { 4245 SpillRC = &AArch64::GPR64RegClass; 4246 SpillSubreg = AArch64::sub_32; 4247 } else if (AArch64::FPR32RegClass.contains(SrcReg)) { 4248 SpillRC = &AArch64::FPR64RegClass; 4249 SpillSubreg = AArch64::ssub; 4250 } else 4251 SpillRC = nullptr; 4252 break; 4253 case AArch64::dsub: 4254 if (AArch64::FPR64RegClass.contains(SrcReg)) { 4255 SpillRC = &AArch64::FPR128RegClass; 4256 SpillSubreg = AArch64::dsub; 4257 } else 4258 SpillRC = nullptr; 4259 break; 4260 } 4261 4262 if (SpillRC) 4263 if (unsigned WidenedSrcReg = 4264 TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) { 4265 storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(), 4266 FrameIndex, SpillRC, &TRI); 4267 return &*--InsertPt; 4268 } 4269 } 4270 4271 // Handle cases like filling use of: 4272 // 4273 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1 4274 // 4275 // where we can load the full virtual reg source stack slot, into the subreg 4276 // destination, in this case producing: 4277 // 4278 // LDRWui %0:sub_32<def,read-undef>, %stack.0 4279 // 4280 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) { 4281 const TargetRegisterClass *FillRC; 4282 switch (DstMO.getSubReg()) { 4283 default: 4284 FillRC = nullptr; 4285 break; 4286 case AArch64::sub_32: 4287 FillRC = &AArch64::GPR32RegClass; 4288 break; 4289 case AArch64::ssub: 4290 FillRC = &AArch64::FPR32RegClass; 4291 break; 4292 case AArch64::dsub: 4293 FillRC = &AArch64::FPR64RegClass; 4294 break; 4295 } 4296 4297 if (FillRC) { 4298 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) == 4299 TRI.getRegSizeInBits(*FillRC) && 4300 "Mismatched regclass size on folded subreg COPY"); 4301 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI); 4302 MachineInstr &LoadMI = *--InsertPt; 4303 MachineOperand &LoadDst = LoadMI.getOperand(0); 4304 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load"); 4305 LoadDst.setSubReg(DstMO.getSubReg()); 4306 LoadDst.setIsUndef(); 4307 return &LoadMI; 4308 } 4309 } 4310 } 4311 4312 // Cannot fold. 
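  // (If none of the shapes above matched, e.g. a subreg-to-subreg COPY, we
  // return nullptr below and the caller keeps the COPY together with a
  // separate spill or reload.)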
4313 return nullptr; 4314 } 4315 4316 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, 4317 StackOffset &SOffset, 4318 bool *OutUseUnscaledOp, 4319 unsigned *OutUnscaledOp, 4320 int64_t *EmittableOffset) { 4321 // Set output values in case of early exit. 4322 if (EmittableOffset) 4323 *EmittableOffset = 0; 4324 if (OutUseUnscaledOp) 4325 *OutUseUnscaledOp = false; 4326 if (OutUnscaledOp) 4327 *OutUnscaledOp = 0; 4328 4329 // Exit early for structured vector spills/fills as they can't take an 4330 // immediate offset. 4331 switch (MI.getOpcode()) { 4332 default: 4333 break; 4334 case AArch64::LD1Twov2d: 4335 case AArch64::LD1Threev2d: 4336 case AArch64::LD1Fourv2d: 4337 case AArch64::LD1Twov1d: 4338 case AArch64::LD1Threev1d: 4339 case AArch64::LD1Fourv1d: 4340 case AArch64::ST1Twov2d: 4341 case AArch64::ST1Threev2d: 4342 case AArch64::ST1Fourv2d: 4343 case AArch64::ST1Twov1d: 4344 case AArch64::ST1Threev1d: 4345 case AArch64::ST1Fourv1d: 4346 case AArch64::IRG: 4347 case AArch64::IRGstack: 4348 case AArch64::STGloop: 4349 case AArch64::STZGloop: 4350 return AArch64FrameOffsetCannotUpdate; 4351 } 4352 4353 // Get the min/max offset and the scale. 4354 TypeSize ScaleValue(0U, false); 4355 unsigned Width; 4356 int64_t MinOff, MaxOff; 4357 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff, 4358 MaxOff)) 4359 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 4360 4361 // Construct the complete offset. 4362 bool IsMulVL = ScaleValue.isScalable(); 4363 unsigned Scale = ScaleValue.getKnownMinSize(); 4364 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed(); 4365 4366 const MachineOperand &ImmOpnd = 4367 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode())); 4368 Offset += ImmOpnd.getImm() * Scale; 4369 4370 // If the offset doesn't match the scale, we rewrite the instruction to 4371 // use the unscaled instruction instead. Likewise, if we have a negative 4372 // offset and there is an unscaled op to use. 4373 Optional<unsigned> UnscaledOp = 4374 AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode()); 4375 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0); 4376 if (useUnscaledOp && 4377 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff, 4378 MaxOff)) 4379 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 4380 4381 Scale = ScaleValue.getKnownMinSize(); 4382 assert(IsMulVL == ScaleValue.isScalable() && 4383 "Unscaled opcode has different value for scalable"); 4384 4385 int64_t Remainder = Offset % Scale; 4386 assert(!(Remainder && useUnscaledOp) && 4387 "Cannot have remainder when using unscaled op"); 4388 4389 assert(MinOff < MaxOff && "Unexpected Min/Max offsets"); 4390 int64_t NewOffset = Offset / Scale; 4391 if (MinOff <= NewOffset && NewOffset <= MaxOff) 4392 Offset = Remainder; 4393 else { 4394 NewOffset = NewOffset < 0 ? MinOff : MaxOff; 4395 Offset = Offset - NewOffset * Scale + Remainder; 4396 } 4397 4398 if (EmittableOffset) 4399 *EmittableOffset = NewOffset; 4400 if (OutUseUnscaledOp) 4401 *OutUseUnscaledOp = useUnscaledOp; 4402 if (OutUnscaledOp && UnscaledOp) 4403 *OutUnscaledOp = *UnscaledOp; 4404 4405 if (IsMulVL) 4406 SOffset = StackOffset::get(SOffset.getFixed(), Offset); 4407 else 4408 SOffset = StackOffset::get(Offset, SOffset.getScalable()); 4409 return AArch64FrameOffsetCanUpdate | 4410 (SOffset ? 
0 : AArch64FrameOffsetIsLegal); 4411 } 4412 4413 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, 4414 unsigned FrameReg, StackOffset &Offset, 4415 const AArch64InstrInfo *TII) { 4416 unsigned Opcode = MI.getOpcode(); 4417 unsigned ImmIdx = FrameRegIdx + 1; 4418 4419 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) { 4420 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm()); 4421 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), 4422 MI.getOperand(0).getReg(), FrameReg, Offset, TII, 4423 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri)); 4424 MI.eraseFromParent(); 4425 Offset = StackOffset(); 4426 return true; 4427 } 4428 4429 int64_t NewOffset; 4430 unsigned UnscaledOp; 4431 bool UseUnscaledOp; 4432 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, 4433 &UnscaledOp, &NewOffset); 4434 if (Status & AArch64FrameOffsetCanUpdate) { 4435 if (Status & AArch64FrameOffsetIsLegal) 4436 // Replace the FrameIndex with FrameReg. 4437 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); 4438 if (UseUnscaledOp) 4439 MI.setDesc(TII->get(UnscaledOp)); 4440 4441 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); 4442 return !Offset; 4443 } 4444 4445 return false; 4446 } 4447 4448 MCInst AArch64InstrInfo::getNop() const { 4449 return MCInstBuilder(AArch64::HINT).addImm(0); 4450 } 4451 4452 // AArch64 supports MachineCombiner. 4453 bool AArch64InstrInfo::useMachineCombiner() const { return true; } 4454 4455 // True when Opc sets flag 4456 static bool isCombineInstrSettingFlag(unsigned Opc) { 4457 switch (Opc) { 4458 case AArch64::ADDSWrr: 4459 case AArch64::ADDSWri: 4460 case AArch64::ADDSXrr: 4461 case AArch64::ADDSXri: 4462 case AArch64::SUBSWrr: 4463 case AArch64::SUBSXrr: 4464 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 4465 case AArch64::SUBSWri: 4466 case AArch64::SUBSXri: 4467 return true; 4468 default: 4469 break; 4470 } 4471 return false; 4472 } 4473 4474 // 32b Opcodes that can be combined with a MUL 4475 static bool isCombineInstrCandidate32(unsigned Opc) { 4476 switch (Opc) { 4477 case AArch64::ADDWrr: 4478 case AArch64::ADDWri: 4479 case AArch64::SUBWrr: 4480 case AArch64::ADDSWrr: 4481 case AArch64::ADDSWri: 4482 case AArch64::SUBSWrr: 4483 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 4484 case AArch64::SUBWri: 4485 case AArch64::SUBSWri: 4486 return true; 4487 default: 4488 break; 4489 } 4490 return false; 4491 } 4492 4493 // 64b Opcodes that can be combined with a MUL 4494 static bool isCombineInstrCandidate64(unsigned Opc) { 4495 switch (Opc) { 4496 case AArch64::ADDXrr: 4497 case AArch64::ADDXri: 4498 case AArch64::SUBXrr: 4499 case AArch64::ADDSXrr: 4500 case AArch64::ADDSXri: 4501 case AArch64::SUBSXrr: 4502 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 4503 case AArch64::SUBXri: 4504 case AArch64::SUBSXri: 4505 case AArch64::ADDv8i8: 4506 case AArch64::ADDv16i8: 4507 case AArch64::ADDv4i16: 4508 case AArch64::ADDv8i16: 4509 case AArch64::ADDv2i32: 4510 case AArch64::ADDv4i32: 4511 case AArch64::SUBv8i8: 4512 case AArch64::SUBv16i8: 4513 case AArch64::SUBv4i16: 4514 case AArch64::SUBv8i16: 4515 case AArch64::SUBv2i32: 4516 case AArch64::SUBv4i32: 4517 return true; 4518 default: 4519 break; 4520 } 4521 return false; 4522 } 4523 4524 // FP Opcodes that can be combined with a FMUL. 
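// For example (illustrative, and only when fusion is allowed as checked
// below):
//   fmul s3, s1, s2
//   fadd s4, s0, s3
// may later be rewritten by the machine combiner into
//   fmadd s4, s1, s2, s0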
4525 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
4526 switch (Inst.getOpcode()) {
4527 default:
4528 break;
4529 case AArch64::FADDHrr:
4530 case AArch64::FADDSrr:
4531 case AArch64::FADDDrr:
4532 case AArch64::FADDv4f16:
4533 case AArch64::FADDv8f16:
4534 case AArch64::FADDv2f32:
4535 case AArch64::FADDv2f64:
4536 case AArch64::FADDv4f32:
4537 case AArch64::FSUBHrr:
4538 case AArch64::FSUBSrr:
4539 case AArch64::FSUBDrr:
4540 case AArch64::FSUBv4f16:
4541 case AArch64::FSUBv8f16:
4542 case AArch64::FSUBv2f32:
4543 case AArch64::FSUBv2f64:
4544 case AArch64::FSUBv4f32:
4545 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
4546 // We can fuse FADD/FSUB with FMUL if fusion is either allowed globally by
4547 // the target options or if FADD/FSUB has the contract fast-math flag.
4548 return Options.UnsafeFPMath ||
4549 Options.AllowFPOpFusion == FPOpFusion::Fast ||
4550 Inst.getFlag(MachineInstr::FmContract);
4552 }
4553 return false;
4554 }
4555
4556 // Opcodes that can be combined with a MUL
4557 static bool isCombineInstrCandidate(unsigned Opc) {
4558 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
4559 }
4560
4561 //
4562 // Utility routine that checks if \param MO is defined by an
4563 // \param CombineOpc instruction in the basic block \param MBB
4564 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
4565 unsigned CombineOpc, unsigned ZeroReg = 0,
4566 bool CheckZeroReg = false) {
4567 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4568 MachineInstr *MI = nullptr;
4569
4570 if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
4571 MI = MRI.getUniqueVRegDef(MO.getReg());
4572 // And it needs to be in the trace (otherwise, it won't have a depth).
4573 if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
4574 return false;
4575 // Must only be used by the user we combine with.
4576 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
4577 return false;
4578
4579 if (CheckZeroReg) {
4580 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
4581 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
4582 MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
4583 // The third input reg must be zero.
4584 if (MI->getOperand(3).getReg() != ZeroReg)
4585 return false;
4586 }
4587
4588 return true;
4589 }
4590
4591 //
4592 // Is \param MO defined by an integer multiply, and can it be combined?
4593 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
4594 unsigned MulOpc, unsigned ZeroReg) {
4595 return canCombine(MBB, MO, MulOpc, ZeroReg, true);
4596 }
4597
4598 //
4599 // Is \param MO defined by a floating-point multiply, and can it be combined?
4600 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
4601 unsigned MulOpc) {
4602 return canCombine(MBB, MO, MulOpc);
4603 }
4604
4605 // TODO: There are many more machine instruction opcodes to match:
4606 // 1. Other data types (integer, vectors)
4607 // 2. Other math / logic operations (xor, or)
4608 // 3.
Other forms of the same operation (intrinsics and other variants) 4609 bool AArch64InstrInfo::isAssociativeAndCommutative( 4610 const MachineInstr &Inst) const { 4611 switch (Inst.getOpcode()) { 4612 case AArch64::FADDDrr: 4613 case AArch64::FADDSrr: 4614 case AArch64::FADDv2f32: 4615 case AArch64::FADDv2f64: 4616 case AArch64::FADDv4f32: 4617 case AArch64::FMULDrr: 4618 case AArch64::FMULSrr: 4619 case AArch64::FMULX32: 4620 case AArch64::FMULX64: 4621 case AArch64::FMULXv2f32: 4622 case AArch64::FMULXv2f64: 4623 case AArch64::FMULXv4f32: 4624 case AArch64::FMULv2f32: 4625 case AArch64::FMULv2f64: 4626 case AArch64::FMULv4f32: 4627 return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; 4628 default: 4629 return false; 4630 } 4631 } 4632 4633 /// Find instructions that can be turned into madd. 4634 static bool getMaddPatterns(MachineInstr &Root, 4635 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 4636 unsigned Opc = Root.getOpcode(); 4637 MachineBasicBlock &MBB = *Root.getParent(); 4638 bool Found = false; 4639 4640 if (!isCombineInstrCandidate(Opc)) 4641 return false; 4642 if (isCombineInstrSettingFlag(Opc)) { 4643 int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true); 4644 // When NZCV is live bail out. 4645 if (Cmp_NZCV == -1) 4646 return false; 4647 unsigned NewOpc = convertToNonFlagSettingOpc(Root); 4648 // When opcode can't change bail out. 4649 // CHECKME: do we miss any cases for opcode conversion? 4650 if (NewOpc == Opc) 4651 return false; 4652 Opc = NewOpc; 4653 } 4654 4655 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg, 4656 MachineCombinerPattern Pattern) { 4657 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) { 4658 Patterns.push_back(Pattern); 4659 Found = true; 4660 } 4661 }; 4662 4663 auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) { 4664 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) { 4665 Patterns.push_back(Pattern); 4666 Found = true; 4667 } 4668 }; 4669 4670 typedef MachineCombinerPattern MCP; 4671 4672 switch (Opc) { 4673 default: 4674 break; 4675 case AArch64::ADDWrr: 4676 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 4677 "ADDWrr does not have register operands"); 4678 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1); 4679 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2); 4680 break; 4681 case AArch64::ADDXrr: 4682 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1); 4683 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2); 4684 break; 4685 case AArch64::SUBWrr: 4686 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1); 4687 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2); 4688 break; 4689 case AArch64::SUBXrr: 4690 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1); 4691 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2); 4692 break; 4693 case AArch64::ADDWri: 4694 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1); 4695 break; 4696 case AArch64::ADDXri: 4697 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1); 4698 break; 4699 case AArch64::SUBWri: 4700 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1); 4701 break; 4702 case AArch64::SUBXri: 4703 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1); 4704 break; 4705 case AArch64::ADDv8i8: 4706 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1); 4707 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2); 4708 break; 4709 case 
AArch64::ADDv16i8: 4710 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1); 4711 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2); 4712 break; 4713 case AArch64::ADDv4i16: 4714 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1); 4715 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2); 4716 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1); 4717 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2); 4718 break; 4719 case AArch64::ADDv8i16: 4720 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1); 4721 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2); 4722 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1); 4723 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2); 4724 break; 4725 case AArch64::ADDv2i32: 4726 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1); 4727 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2); 4728 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1); 4729 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2); 4730 break; 4731 case AArch64::ADDv4i32: 4732 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1); 4733 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2); 4734 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1); 4735 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2); 4736 break; 4737 case AArch64::SUBv8i8: 4738 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1); 4739 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2); 4740 break; 4741 case AArch64::SUBv16i8: 4742 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1); 4743 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2); 4744 break; 4745 case AArch64::SUBv4i16: 4746 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1); 4747 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2); 4748 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1); 4749 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2); 4750 break; 4751 case AArch64::SUBv8i16: 4752 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1); 4753 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2); 4754 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1); 4755 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2); 4756 break; 4757 case AArch64::SUBv2i32: 4758 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1); 4759 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2); 4760 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1); 4761 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2); 4762 break; 4763 case AArch64::SUBv4i32: 4764 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1); 4765 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2); 4766 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1); 4767 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2); 4768 break; 4769 } 4770 return Found; 4771 } 4772 /// Floating-Point Support 4773 4774 /// Find instructions that can be turned into madd. 
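/// For example (illustrative): an FADDSrr whose first or second operand is
/// produced by a single-use FMULSrr in the same block matches the
/// FMULADDS_OP1 / FMULADDS_OP2 patterns checked below.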
4775 static bool getFMAPatterns(MachineInstr &Root, 4776 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 4777 4778 if (!isCombineInstrCandidateFP(Root)) 4779 return false; 4780 4781 MachineBasicBlock &MBB = *Root.getParent(); 4782 bool Found = false; 4783 4784 auto Match = [&](int Opcode, int Operand, 4785 MachineCombinerPattern Pattern) -> bool { 4786 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) { 4787 Patterns.push_back(Pattern); 4788 return true; 4789 } 4790 return false; 4791 }; 4792 4793 typedef MachineCombinerPattern MCP; 4794 4795 switch (Root.getOpcode()) { 4796 default: 4797 assert(false && "Unsupported FP instruction in combiner\n"); 4798 break; 4799 case AArch64::FADDHrr: 4800 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 4801 "FADDHrr does not have register operands"); 4802 4803 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1); 4804 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2); 4805 break; 4806 case AArch64::FADDSrr: 4807 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 4808 "FADDSrr does not have register operands"); 4809 4810 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) || 4811 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1); 4812 4813 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) || 4814 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2); 4815 break; 4816 case AArch64::FADDDrr: 4817 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) || 4818 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1); 4819 4820 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) || 4821 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2); 4822 break; 4823 case AArch64::FADDv4f16: 4824 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) || 4825 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1); 4826 4827 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) || 4828 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2); 4829 break; 4830 case AArch64::FADDv8f16: 4831 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) || 4832 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1); 4833 4834 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) || 4835 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2); 4836 break; 4837 case AArch64::FADDv2f32: 4838 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) || 4839 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1); 4840 4841 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) || 4842 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2); 4843 break; 4844 case AArch64::FADDv2f64: 4845 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) || 4846 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1); 4847 4848 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) || 4849 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2); 4850 break; 4851 case AArch64::FADDv4f32: 4852 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) || 4853 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1); 4854 4855 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) || 4856 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2); 4857 break; 4858 case AArch64::FSUBHrr: 4859 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1); 4860 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2); 4861 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1); 4862 break; 
4863 case AArch64::FSUBSrr: 4864 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1); 4865 4866 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) || 4867 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2); 4868 4869 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1); 4870 break; 4871 case AArch64::FSUBDrr: 4872 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1); 4873 4874 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) || 4875 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2); 4876 4877 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1); 4878 break; 4879 case AArch64::FSUBv4f16: 4880 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) || 4881 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2); 4882 4883 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) || 4884 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1); 4885 break; 4886 case AArch64::FSUBv8f16: 4887 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) || 4888 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2); 4889 4890 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) || 4891 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1); 4892 break; 4893 case AArch64::FSUBv2f32: 4894 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) || 4895 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2); 4896 4897 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) || 4898 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1); 4899 break; 4900 case AArch64::FSUBv2f64: 4901 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) || 4902 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2); 4903 4904 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) || 4905 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1); 4906 break; 4907 case AArch64::FSUBv4f32: 4908 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) || 4909 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2); 4910 4911 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) || 4912 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1); 4913 break; 4914 } 4915 return Found; 4916 } 4917 4918 /// Return true when a code sequence can improve throughput. It 4919 /// should be called only for instructions in loops. 
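/// (Illustrative rationale: in a loop body, folding an FMUL that feeds an
/// FADD into a single FMLA removes one instruction per iteration, which can
/// improve throughput even when it does not shorten the critical path.)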
4920 /// \param Pattern - combiner pattern 4921 bool AArch64InstrInfo::isThroughputPattern( 4922 MachineCombinerPattern Pattern) const { 4923 switch (Pattern) { 4924 default: 4925 break; 4926 case MachineCombinerPattern::FMULADDH_OP1: 4927 case MachineCombinerPattern::FMULADDH_OP2: 4928 case MachineCombinerPattern::FMULSUBH_OP1: 4929 case MachineCombinerPattern::FMULSUBH_OP2: 4930 case MachineCombinerPattern::FMULADDS_OP1: 4931 case MachineCombinerPattern::FMULADDS_OP2: 4932 case MachineCombinerPattern::FMULSUBS_OP1: 4933 case MachineCombinerPattern::FMULSUBS_OP2: 4934 case MachineCombinerPattern::FMULADDD_OP1: 4935 case MachineCombinerPattern::FMULADDD_OP2: 4936 case MachineCombinerPattern::FMULSUBD_OP1: 4937 case MachineCombinerPattern::FMULSUBD_OP2: 4938 case MachineCombinerPattern::FNMULSUBH_OP1: 4939 case MachineCombinerPattern::FNMULSUBS_OP1: 4940 case MachineCombinerPattern::FNMULSUBD_OP1: 4941 case MachineCombinerPattern::FMLAv4i16_indexed_OP1: 4942 case MachineCombinerPattern::FMLAv4i16_indexed_OP2: 4943 case MachineCombinerPattern::FMLAv8i16_indexed_OP1: 4944 case MachineCombinerPattern::FMLAv8i16_indexed_OP2: 4945 case MachineCombinerPattern::FMLAv1i32_indexed_OP1: 4946 case MachineCombinerPattern::FMLAv1i32_indexed_OP2: 4947 case MachineCombinerPattern::FMLAv1i64_indexed_OP1: 4948 case MachineCombinerPattern::FMLAv1i64_indexed_OP2: 4949 case MachineCombinerPattern::FMLAv4f16_OP2: 4950 case MachineCombinerPattern::FMLAv4f16_OP1: 4951 case MachineCombinerPattern::FMLAv8f16_OP1: 4952 case MachineCombinerPattern::FMLAv8f16_OP2: 4953 case MachineCombinerPattern::FMLAv2f32_OP2: 4954 case MachineCombinerPattern::FMLAv2f32_OP1: 4955 case MachineCombinerPattern::FMLAv2f64_OP1: 4956 case MachineCombinerPattern::FMLAv2f64_OP2: 4957 case MachineCombinerPattern::FMLAv2i32_indexed_OP1: 4958 case MachineCombinerPattern::FMLAv2i32_indexed_OP2: 4959 case MachineCombinerPattern::FMLAv2i64_indexed_OP1: 4960 case MachineCombinerPattern::FMLAv2i64_indexed_OP2: 4961 case MachineCombinerPattern::FMLAv4f32_OP1: 4962 case MachineCombinerPattern::FMLAv4f32_OP2: 4963 case MachineCombinerPattern::FMLAv4i32_indexed_OP1: 4964 case MachineCombinerPattern::FMLAv4i32_indexed_OP2: 4965 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: 4966 case MachineCombinerPattern::FMLSv4i16_indexed_OP2: 4967 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: 4968 case MachineCombinerPattern::FMLSv8i16_indexed_OP2: 4969 case MachineCombinerPattern::FMLSv1i32_indexed_OP2: 4970 case MachineCombinerPattern::FMLSv1i64_indexed_OP2: 4971 case MachineCombinerPattern::FMLSv2i32_indexed_OP2: 4972 case MachineCombinerPattern::FMLSv2i64_indexed_OP2: 4973 case MachineCombinerPattern::FMLSv4f16_OP1: 4974 case MachineCombinerPattern::FMLSv4f16_OP2: 4975 case MachineCombinerPattern::FMLSv8f16_OP1: 4976 case MachineCombinerPattern::FMLSv8f16_OP2: 4977 case MachineCombinerPattern::FMLSv2f32_OP2: 4978 case MachineCombinerPattern::FMLSv2f64_OP2: 4979 case MachineCombinerPattern::FMLSv4i32_indexed_OP2: 4980 case MachineCombinerPattern::FMLSv4f32_OP2: 4981 case MachineCombinerPattern::MULADDv8i8_OP1: 4982 case MachineCombinerPattern::MULADDv8i8_OP2: 4983 case MachineCombinerPattern::MULADDv16i8_OP1: 4984 case MachineCombinerPattern::MULADDv16i8_OP2: 4985 case MachineCombinerPattern::MULADDv4i16_OP1: 4986 case MachineCombinerPattern::MULADDv4i16_OP2: 4987 case MachineCombinerPattern::MULADDv8i16_OP1: 4988 case MachineCombinerPattern::MULADDv8i16_OP2: 4989 case MachineCombinerPattern::MULADDv2i32_OP1: 4990 case 
MachineCombinerPattern::MULADDv2i32_OP2: 4991 case MachineCombinerPattern::MULADDv4i32_OP1: 4992 case MachineCombinerPattern::MULADDv4i32_OP2: 4993 case MachineCombinerPattern::MULSUBv8i8_OP1: 4994 case MachineCombinerPattern::MULSUBv8i8_OP2: 4995 case MachineCombinerPattern::MULSUBv16i8_OP1: 4996 case MachineCombinerPattern::MULSUBv16i8_OP2: 4997 case MachineCombinerPattern::MULSUBv4i16_OP1: 4998 case MachineCombinerPattern::MULSUBv4i16_OP2: 4999 case MachineCombinerPattern::MULSUBv8i16_OP1: 5000 case MachineCombinerPattern::MULSUBv8i16_OP2: 5001 case MachineCombinerPattern::MULSUBv2i32_OP1: 5002 case MachineCombinerPattern::MULSUBv2i32_OP2: 5003 case MachineCombinerPattern::MULSUBv4i32_OP1: 5004 case MachineCombinerPattern::MULSUBv4i32_OP2: 5005 case MachineCombinerPattern::MULADDv4i16_indexed_OP1: 5006 case MachineCombinerPattern::MULADDv4i16_indexed_OP2: 5007 case MachineCombinerPattern::MULADDv8i16_indexed_OP1: 5008 case MachineCombinerPattern::MULADDv8i16_indexed_OP2: 5009 case MachineCombinerPattern::MULADDv2i32_indexed_OP1: 5010 case MachineCombinerPattern::MULADDv2i32_indexed_OP2: 5011 case MachineCombinerPattern::MULADDv4i32_indexed_OP1: 5012 case MachineCombinerPattern::MULADDv4i32_indexed_OP2: 5013 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 5014 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 5015 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 5016 case MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 5017 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 5018 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 5019 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 5020 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 5021 return true; 5022 } // end switch (Pattern) 5023 return false; 5024 } 5025 /// Return true when there is potentially a faster code sequence for an 5026 /// instruction chain ending in \p Root. All potential patterns are listed in 5027 /// the \p Pattern vector. Pattern should be sorted in priority order since the 5028 /// pattern evaluator stops checking as soon as it finds a faster sequence. 5029 5030 bool AArch64InstrInfo::getMachineCombinerPatterns( 5031 MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns, 5032 bool DoRegPressureReduce) const { 5033 // Integer patterns 5034 if (getMaddPatterns(Root, Patterns)) 5035 return true; 5036 // Floating point patterns 5037 if (getFMAPatterns(Root, Patterns)) 5038 return true; 5039 5040 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, 5041 DoRegPressureReduce); 5042 } 5043 5044 enum class FMAInstKind { Default, Indexed, Accumulator }; 5045 /// genFusedMultiply - Generate fused multiply instructions. 5046 /// This function supports both integer and floating point instructions. 5047 /// A typical example: 5048 /// F|MUL I=A,B,0 5049 /// F|ADD R,I,C 5050 /// ==> F|MADD R,A,B,C 5051 /// \param MF Containing MachineFunction 5052 /// \param MRI Register information 5053 /// \param TII Target information 5054 /// \param Root is the F|ADD instruction 5055 /// \param [out] InsInstrs is a vector of machine instructions and will 5056 /// contain the generated madd instruction 5057 /// \param IdxMulOpd is index of operand in Root that is the result of 5058 /// the F|MUL. In the example above IdxMulOpd is 1. 
/// \param MaddOpc the opcode of the f|madd instruction
/// \param RC Register class of operands
/// \param kind of fma instruction (addressing mode) to be generated
/// \param ReplacedAddend is the result register from the instruction
/// replacing the non-combined operand, if any.
static MachineInstr *
genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
                 const TargetInstrInfo *TII, MachineInstr &Root,
                 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
                 unsigned MaddOpc, const TargetRegisterClass *RC,
                 FMAInstKind kind = FMAInstKind::Default,
                 const Register *ReplacedAddend = nullptr) {
  assert(IdxMulOpd == 1 || IdxMulOpd == 2);

  unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
  Register ResultReg = Root.getOperand(0).getReg();
  Register SrcReg0 = MUL->getOperand(1).getReg();
  bool Src0IsKill = MUL->getOperand(1).isKill();
  Register SrcReg1 = MUL->getOperand(2).getReg();
  bool Src1IsKill = MUL->getOperand(2).isKill();

  unsigned SrcReg2;
  bool Src2IsKill;
  if (ReplacedAddend) {
    // If we just generated a new addend, we must be its only use.
    SrcReg2 = *ReplacedAddend;
    Src2IsKill = true;
  } else {
    SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
    Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
  }

  if (Register::isVirtualRegister(ResultReg))
    MRI.constrainRegClass(ResultReg, RC);
  if (Register::isVirtualRegister(SrcReg0))
    MRI.constrainRegClass(SrcReg0, RC);
  if (Register::isVirtualRegister(SrcReg1))
    MRI.constrainRegClass(SrcReg1, RC);
  if (Register::isVirtualRegister(SrcReg2))
    MRI.constrainRegClass(SrcReg2, RC);

  MachineInstrBuilder MIB;
  if (kind == FMAInstKind::Default)
    MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill))
              .addReg(SrcReg2, getKillRegState(Src2IsKill));
  else if (kind == FMAInstKind::Indexed)
    MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg2, getKillRegState(Src2IsKill))
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill))
              .addImm(MUL->getOperand(3).getImm());
  else if (kind == FMAInstKind::Accumulator)
    MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg2, getKillRegState(Src2IsKill))
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill));
  else
    assert(false && "Invalid FMA instruction kind");
  // Insert the fused multiply (MADD, FMA, FMS, FMLA, FMLS).
  InsInstrs.push_back(MIB);
  return MUL;
}

/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
/// instructions.
///
/// \see genFusedMultiply
static MachineInstr *genFusedMultiplyAcc(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Accumulator);
}

/// genNeg - Helper to generate an intermediate negation of the second operand
/// of Root
static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
                       const TargetInstrInfo *TII, MachineInstr &Root,
                       SmallVectorImpl<MachineInstr *> &InsInstrs,
                       DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                       unsigned MnegOpc, const TargetRegisterClass *RC) {
  Register NewVR = MRI.createVirtualRegister(RC);
  MachineInstrBuilder MIB =
      BuildMI(MF, Root.getDebugLoc(), TII->get(MnegOpc), NewVR)
          .add(Root.getOperand(2));
  InsInstrs.push_back(MIB);

  assert(InstrIdxForVirtReg.empty());
  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));

  return NewVR;
}

/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
/// instructions with an additional negation of the accumulator
static MachineInstr *genFusedMultiplyAccNeg(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
    unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
  assert(IdxMulOpd == 1);

  Register NewVR =
      genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Accumulator, &NewVR);
}

/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
/// instructions.
///
/// \see genFusedMultiply
static MachineInstr *genFusedMultiplyIdx(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Indexed);
}

/// genFusedMultiplyIdxNeg - Helper to generate fused multiply accumulate
/// instructions with an additional negation of the accumulator
static MachineInstr *genFusedMultiplyIdxNeg(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
    unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
  assert(IdxMulOpd == 1);

  Register NewVR =
      genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);

  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Indexed, &NewVR);
}

/// genMaddR - Generate madd instruction and combine mul and add using
/// an extra virtual register
/// Example - an ADD intermediate needs to be stored in a register:
///   MUL I=A,B,0
///   ADD R,I,Imm
///   ==> ORR V, ZR, Imm
///   ==> MADD R,A,B,V
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the madd instruction
/// \param VR is a virtual register that holds the value of an ADD operand
/// (V in the example above).
/// \param RC Register class of operands
static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
                              const TargetInstrInfo *TII, MachineInstr &Root,
                              SmallVectorImpl<MachineInstr *> &InsInstrs,
                              unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
                              const TargetRegisterClass *RC) {
  assert(IdxMulOpd == 1 || IdxMulOpd == 2);

  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
  Register ResultReg = Root.getOperand(0).getReg();
  Register SrcReg0 = MUL->getOperand(1).getReg();
  bool Src0IsKill = MUL->getOperand(1).isKill();
  Register SrcReg1 = MUL->getOperand(2).getReg();
  bool Src1IsKill = MUL->getOperand(2).isKill();

  if (Register::isVirtualRegister(ResultReg))
    MRI.constrainRegClass(ResultReg, RC);
  if (Register::isVirtualRegister(SrcReg0))
    MRI.constrainRegClass(SrcReg0, RC);
  if (Register::isVirtualRegister(SrcReg1))
    MRI.constrainRegClass(SrcReg1, RC);
  if (Register::isVirtualRegister(VR))
    MRI.constrainRegClass(VR, RC);

  MachineInstrBuilder MIB =
      BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
          .addReg(SrcReg0, getKillRegState(Src0IsKill))
          .addReg(SrcReg1, getKillRegState(Src1IsKill))
          .addReg(VR);
  // Insert the MADD
  InsInstrs.push_back(MIB);
  return MUL;
}

/// When getMachineCombinerPatterns() finds potential patterns,
/// this function generates the instructions that could replace the
/// original code sequence
void AArch64InstrInfo::genAlternativeCodeSequence(
    MachineInstr &Root, MachineCombinerPattern Pattern,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
  MachineBasicBlock &MBB = *Root.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineFunction &MF = *MBB.getParent();
  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();

  MachineInstr *MUL = nullptr;
  const TargetRegisterClass *RC;
  unsigned Opc;
  switch (Pattern) {
  default:
    // Reassociate instructions.
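    // A sketch of what this default path covers: the generic combiner can,
    // for example, rebalance ((a + b) + c) + d into (a + b) + (c + d) to
    // shorten the dependency chain; no AArch64-specific handling is needed
    // for those reassociation patterns.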
5270 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, 5271 DelInstrs, InstrIdxForVirtReg); 5272 return; 5273 case MachineCombinerPattern::MULADDW_OP1: 5274 case MachineCombinerPattern::MULADDX_OP1: 5275 // MUL I=A,B,0 5276 // ADD R,I,C 5277 // ==> MADD R,A,B,C 5278 // --- Create(MADD); 5279 if (Pattern == MachineCombinerPattern::MULADDW_OP1) { 5280 Opc = AArch64::MADDWrrr; 5281 RC = &AArch64::GPR32RegClass; 5282 } else { 5283 Opc = AArch64::MADDXrrr; 5284 RC = &AArch64::GPR64RegClass; 5285 } 5286 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5287 break; 5288 case MachineCombinerPattern::MULADDW_OP2: 5289 case MachineCombinerPattern::MULADDX_OP2: 5290 // MUL I=A,B,0 5291 // ADD R,C,I 5292 // ==> MADD R,A,B,C 5293 // --- Create(MADD); 5294 if (Pattern == MachineCombinerPattern::MULADDW_OP2) { 5295 Opc = AArch64::MADDWrrr; 5296 RC = &AArch64::GPR32RegClass; 5297 } else { 5298 Opc = AArch64::MADDXrrr; 5299 RC = &AArch64::GPR64RegClass; 5300 } 5301 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5302 break; 5303 case MachineCombinerPattern::MULADDWI_OP1: 5304 case MachineCombinerPattern::MULADDXI_OP1: { 5305 // MUL I=A,B,0 5306 // ADD R,I,Imm 5307 // ==> ORR V, ZR, Imm 5308 // ==> MADD R,A,B,V 5309 // --- Create(MADD); 5310 const TargetRegisterClass *OrrRC; 5311 unsigned BitSize, OrrOpc, ZeroReg; 5312 if (Pattern == MachineCombinerPattern::MULADDWI_OP1) { 5313 OrrOpc = AArch64::ORRWri; 5314 OrrRC = &AArch64::GPR32spRegClass; 5315 BitSize = 32; 5316 ZeroReg = AArch64::WZR; 5317 Opc = AArch64::MADDWrrr; 5318 RC = &AArch64::GPR32RegClass; 5319 } else { 5320 OrrOpc = AArch64::ORRXri; 5321 OrrRC = &AArch64::GPR64spRegClass; 5322 BitSize = 64; 5323 ZeroReg = AArch64::XZR; 5324 Opc = AArch64::MADDXrrr; 5325 RC = &AArch64::GPR64RegClass; 5326 } 5327 Register NewVR = MRI.createVirtualRegister(OrrRC); 5328 uint64_t Imm = Root.getOperand(2).getImm(); 5329 5330 if (Root.getOperand(3).isImm()) { 5331 unsigned Val = Root.getOperand(3).getImm(); 5332 Imm = Imm << Val; 5333 } 5334 uint64_t UImm = SignExtend64(Imm, BitSize); 5335 uint64_t Encoding; 5336 if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { 5337 MachineInstrBuilder MIB1 = 5338 BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR) 5339 .addReg(ZeroReg) 5340 .addImm(Encoding); 5341 InsInstrs.push_back(MIB1); 5342 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5343 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 5344 } 5345 break; 5346 } 5347 case MachineCombinerPattern::MULSUBW_OP1: 5348 case MachineCombinerPattern::MULSUBX_OP1: { 5349 // MUL I=A,B,0 5350 // SUB R,I, C 5351 // ==> SUB V, 0, C 5352 // ==> MADD R,A,B,V // = -C + A*B 5353 // --- Create(MADD); 5354 const TargetRegisterClass *SubRC; 5355 unsigned SubOpc, ZeroReg; 5356 if (Pattern == MachineCombinerPattern::MULSUBW_OP1) { 5357 SubOpc = AArch64::SUBWrr; 5358 SubRC = &AArch64::GPR32spRegClass; 5359 ZeroReg = AArch64::WZR; 5360 Opc = AArch64::MADDWrrr; 5361 RC = &AArch64::GPR32RegClass; 5362 } else { 5363 SubOpc = AArch64::SUBXrr; 5364 SubRC = &AArch64::GPR64spRegClass; 5365 ZeroReg = AArch64::XZR; 5366 Opc = AArch64::MADDXrrr; 5367 RC = &AArch64::GPR64RegClass; 5368 } 5369 Register NewVR = MRI.createVirtualRegister(SubRC); 5370 // SUB NewVR, 0, C 5371 MachineInstrBuilder MIB1 = 5372 BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR) 5373 .addReg(ZeroReg) 5374 .add(Root.getOperand(2)); 5375 InsInstrs.push_back(MIB1); 5376 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5377 MUL 
= genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 5378 break; 5379 } 5380 case MachineCombinerPattern::MULSUBW_OP2: 5381 case MachineCombinerPattern::MULSUBX_OP2: 5382 // MUL I=A,B,0 5383 // SUB R,C,I 5384 // ==> MSUB R,A,B,C (computes C - A*B) 5385 // --- Create(MSUB); 5386 if (Pattern == MachineCombinerPattern::MULSUBW_OP2) { 5387 Opc = AArch64::MSUBWrrr; 5388 RC = &AArch64::GPR32RegClass; 5389 } else { 5390 Opc = AArch64::MSUBXrrr; 5391 RC = &AArch64::GPR64RegClass; 5392 } 5393 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5394 break; 5395 case MachineCombinerPattern::MULSUBWI_OP1: 5396 case MachineCombinerPattern::MULSUBXI_OP1: { 5397 // MUL I=A,B,0 5398 // SUB R,I, Imm 5399 // ==> ORR V, ZR, -Imm 5400 // ==> MADD R,A,B,V // = -Imm + A*B 5401 // --- Create(MADD); 5402 const TargetRegisterClass *OrrRC; 5403 unsigned BitSize, OrrOpc, ZeroReg; 5404 if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) { 5405 OrrOpc = AArch64::ORRWri; 5406 OrrRC = &AArch64::GPR32spRegClass; 5407 BitSize = 32; 5408 ZeroReg = AArch64::WZR; 5409 Opc = AArch64::MADDWrrr; 5410 RC = &AArch64::GPR32RegClass; 5411 } else { 5412 OrrOpc = AArch64::ORRXri; 5413 OrrRC = &AArch64::GPR64spRegClass; 5414 BitSize = 64; 5415 ZeroReg = AArch64::XZR; 5416 Opc = AArch64::MADDXrrr; 5417 RC = &AArch64::GPR64RegClass; 5418 } 5419 Register NewVR = MRI.createVirtualRegister(OrrRC); 5420 uint64_t Imm = Root.getOperand(2).getImm(); 5421 if (Root.getOperand(3).isImm()) { 5422 unsigned Val = Root.getOperand(3).getImm(); 5423 Imm = Imm << Val; 5424 } 5425 uint64_t UImm = SignExtend64(-Imm, BitSize); 5426 uint64_t Encoding; 5427 if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { 5428 MachineInstrBuilder MIB1 = 5429 BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR) 5430 .addReg(ZeroReg) 5431 .addImm(Encoding); 5432 InsInstrs.push_back(MIB1); 5433 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5434 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 5435 } 5436 break; 5437 } 5438 5439 case MachineCombinerPattern::MULADDv8i8_OP1: 5440 Opc = AArch64::MLAv8i8; 5441 RC = &AArch64::FPR64RegClass; 5442 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5443 break; 5444 case MachineCombinerPattern::MULADDv8i8_OP2: 5445 Opc = AArch64::MLAv8i8; 5446 RC = &AArch64::FPR64RegClass; 5447 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5448 break; 5449 case MachineCombinerPattern::MULADDv16i8_OP1: 5450 Opc = AArch64::MLAv16i8; 5451 RC = &AArch64::FPR128RegClass; 5452 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5453 break; 5454 case MachineCombinerPattern::MULADDv16i8_OP2: 5455 Opc = AArch64::MLAv16i8; 5456 RC = &AArch64::FPR128RegClass; 5457 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5458 break; 5459 case MachineCombinerPattern::MULADDv4i16_OP1: 5460 Opc = AArch64::MLAv4i16; 5461 RC = &AArch64::FPR64RegClass; 5462 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5463 break; 5464 case MachineCombinerPattern::MULADDv4i16_OP2: 5465 Opc = AArch64::MLAv4i16; 5466 RC = &AArch64::FPR64RegClass; 5467 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5468 break; 5469 case MachineCombinerPattern::MULADDv8i16_OP1: 5470 Opc = AArch64::MLAv8i16; 5471 RC = &AArch64::FPR128RegClass; 5472 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5473 break; 5474 case MachineCombinerPattern::MULADDv8i16_OP2: 5475 Opc = AArch64::MLAv8i16; 5476 RC = 
&AArch64::FPR128RegClass; 5477 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5478 break; 5479 case MachineCombinerPattern::MULADDv2i32_OP1: 5480 Opc = AArch64::MLAv2i32; 5481 RC = &AArch64::FPR64RegClass; 5482 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5483 break; 5484 case MachineCombinerPattern::MULADDv2i32_OP2: 5485 Opc = AArch64::MLAv2i32; 5486 RC = &AArch64::FPR64RegClass; 5487 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5488 break; 5489 case MachineCombinerPattern::MULADDv4i32_OP1: 5490 Opc = AArch64::MLAv4i32; 5491 RC = &AArch64::FPR128RegClass; 5492 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5493 break; 5494 case MachineCombinerPattern::MULADDv4i32_OP2: 5495 Opc = AArch64::MLAv4i32; 5496 RC = &AArch64::FPR128RegClass; 5497 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5498 break; 5499 5500 case MachineCombinerPattern::MULSUBv8i8_OP1: 5501 Opc = AArch64::MLAv8i8; 5502 RC = &AArch64::FPR64RegClass; 5503 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 5504 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8, 5505 RC); 5506 break; 5507 case MachineCombinerPattern::MULSUBv8i8_OP2: 5508 Opc = AArch64::MLSv8i8; 5509 RC = &AArch64::FPR64RegClass; 5510 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5511 break; 5512 case MachineCombinerPattern::MULSUBv16i8_OP1: 5513 Opc = AArch64::MLAv16i8; 5514 RC = &AArch64::FPR128RegClass; 5515 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 5516 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8, 5517 RC); 5518 break; 5519 case MachineCombinerPattern::MULSUBv16i8_OP2: 5520 Opc = AArch64::MLSv16i8; 5521 RC = &AArch64::FPR128RegClass; 5522 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5523 break; 5524 case MachineCombinerPattern::MULSUBv4i16_OP1: 5525 Opc = AArch64::MLAv4i16; 5526 RC = &AArch64::FPR64RegClass; 5527 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 5528 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, 5529 RC); 5530 break; 5531 case MachineCombinerPattern::MULSUBv4i16_OP2: 5532 Opc = AArch64::MLSv4i16; 5533 RC = &AArch64::FPR64RegClass; 5534 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5535 break; 5536 case MachineCombinerPattern::MULSUBv8i16_OP1: 5537 Opc = AArch64::MLAv8i16; 5538 RC = &AArch64::FPR128RegClass; 5539 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 5540 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, 5541 RC); 5542 break; 5543 case MachineCombinerPattern::MULSUBv8i16_OP2: 5544 Opc = AArch64::MLSv8i16; 5545 RC = &AArch64::FPR128RegClass; 5546 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5547 break; 5548 case MachineCombinerPattern::MULSUBv2i32_OP1: 5549 Opc = AArch64::MLAv2i32; 5550 RC = &AArch64::FPR64RegClass; 5551 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 5552 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, 5553 RC); 5554 break; 5555 case MachineCombinerPattern::MULSUBv2i32_OP2: 5556 Opc = AArch64::MLSv2i32; 5557 RC = &AArch64::FPR64RegClass; 5558 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5559 break; 5560 case MachineCombinerPattern::MULSUBv4i32_OP1: 5561 Opc = AArch64::MLAv4i32; 5562 RC = &AArch64::FPR128RegClass; 5563 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 5564 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, 5565 RC); 5566 break; 5567 case MachineCombinerPattern::MULSUBv4i32_OP2: 5568 Opc = 
AArch64::MLSv4i32; 5569 RC = &AArch64::FPR128RegClass; 5570 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5571 break; 5572 5573 case MachineCombinerPattern::MULADDv4i16_indexed_OP1: 5574 Opc = AArch64::MLAv4i16_indexed; 5575 RC = &AArch64::FPR64RegClass; 5576 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5577 break; 5578 case MachineCombinerPattern::MULADDv4i16_indexed_OP2: 5579 Opc = AArch64::MLAv4i16_indexed; 5580 RC = &AArch64::FPR64RegClass; 5581 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5582 break; 5583 case MachineCombinerPattern::MULADDv8i16_indexed_OP1: 5584 Opc = AArch64::MLAv8i16_indexed; 5585 RC = &AArch64::FPR128RegClass; 5586 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5587 break; 5588 case MachineCombinerPattern::MULADDv8i16_indexed_OP2: 5589 Opc = AArch64::MLAv8i16_indexed; 5590 RC = &AArch64::FPR128RegClass; 5591 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5592 break; 5593 case MachineCombinerPattern::MULADDv2i32_indexed_OP1: 5594 Opc = AArch64::MLAv2i32_indexed; 5595 RC = &AArch64::FPR64RegClass; 5596 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5597 break; 5598 case MachineCombinerPattern::MULADDv2i32_indexed_OP2: 5599 Opc = AArch64::MLAv2i32_indexed; 5600 RC = &AArch64::FPR64RegClass; 5601 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5602 break; 5603 case MachineCombinerPattern::MULADDv4i32_indexed_OP1: 5604 Opc = AArch64::MLAv4i32_indexed; 5605 RC = &AArch64::FPR128RegClass; 5606 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5607 break; 5608 case MachineCombinerPattern::MULADDv4i32_indexed_OP2: 5609 Opc = AArch64::MLAv4i32_indexed; 5610 RC = &AArch64::FPR128RegClass; 5611 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5612 break; 5613 5614 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 5615 Opc = AArch64::MLAv4i16_indexed; 5616 RC = &AArch64::FPR64RegClass; 5617 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 5618 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, 5619 RC); 5620 break; 5621 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 5622 Opc = AArch64::MLSv4i16_indexed; 5623 RC = &AArch64::FPR64RegClass; 5624 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5625 break; 5626 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 5627 Opc = AArch64::MLAv8i16_indexed; 5628 RC = &AArch64::FPR128RegClass; 5629 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 5630 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, 5631 RC); 5632 break; 5633 case MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 5634 Opc = AArch64::MLSv8i16_indexed; 5635 RC = &AArch64::FPR128RegClass; 5636 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5637 break; 5638 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 5639 Opc = AArch64::MLAv2i32_indexed; 5640 RC = &AArch64::FPR64RegClass; 5641 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 5642 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, 5643 RC); 5644 break; 5645 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 5646 Opc = AArch64::MLSv2i32_indexed; 5647 RC = &AArch64::FPR64RegClass; 5648 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5649 break; 5650 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 5651 Opc = AArch64::MLAv4i32_indexed; 5652 RC = &AArch64::FPR128RegClass; 5653 MUL = genFusedMultiplyIdxNeg(MF, 
MRI, TII, Root, InsInstrs, 5654 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, 5655 RC); 5656 break; 5657 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 5658 Opc = AArch64::MLSv4i32_indexed; 5659 RC = &AArch64::FPR128RegClass; 5660 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5661 break; 5662 5663 // Floating Point Support 5664 case MachineCombinerPattern::FMULADDH_OP1: 5665 Opc = AArch64::FMADDHrrr; 5666 RC = &AArch64::FPR16RegClass; 5667 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5668 break; 5669 case MachineCombinerPattern::FMULADDS_OP1: 5670 Opc = AArch64::FMADDSrrr; 5671 RC = &AArch64::FPR32RegClass; 5672 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5673 break; 5674 case MachineCombinerPattern::FMULADDD_OP1: 5675 Opc = AArch64::FMADDDrrr; 5676 RC = &AArch64::FPR64RegClass; 5677 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5678 break; 5679 5680 case MachineCombinerPattern::FMULADDH_OP2: 5681 Opc = AArch64::FMADDHrrr; 5682 RC = &AArch64::FPR16RegClass; 5683 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5684 break; 5685 case MachineCombinerPattern::FMULADDS_OP2: 5686 Opc = AArch64::FMADDSrrr; 5687 RC = &AArch64::FPR32RegClass; 5688 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5689 break; 5690 case MachineCombinerPattern::FMULADDD_OP2: 5691 Opc = AArch64::FMADDDrrr; 5692 RC = &AArch64::FPR64RegClass; 5693 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5694 break; 5695 5696 case MachineCombinerPattern::FMLAv1i32_indexed_OP1: 5697 Opc = AArch64::FMLAv1i32_indexed; 5698 RC = &AArch64::FPR32RegClass; 5699 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5700 FMAInstKind::Indexed); 5701 break; 5702 case MachineCombinerPattern::FMLAv1i32_indexed_OP2: 5703 Opc = AArch64::FMLAv1i32_indexed; 5704 RC = &AArch64::FPR32RegClass; 5705 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5706 FMAInstKind::Indexed); 5707 break; 5708 5709 case MachineCombinerPattern::FMLAv1i64_indexed_OP1: 5710 Opc = AArch64::FMLAv1i64_indexed; 5711 RC = &AArch64::FPR64RegClass; 5712 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5713 FMAInstKind::Indexed); 5714 break; 5715 case MachineCombinerPattern::FMLAv1i64_indexed_OP2: 5716 Opc = AArch64::FMLAv1i64_indexed; 5717 RC = &AArch64::FPR64RegClass; 5718 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5719 FMAInstKind::Indexed); 5720 break; 5721 5722 case MachineCombinerPattern::FMLAv4i16_indexed_OP1: 5723 RC = &AArch64::FPR64RegClass; 5724 Opc = AArch64::FMLAv4i16_indexed; 5725 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5726 FMAInstKind::Indexed); 5727 break; 5728 case MachineCombinerPattern::FMLAv4f16_OP1: 5729 RC = &AArch64::FPR64RegClass; 5730 Opc = AArch64::FMLAv4f16; 5731 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5732 FMAInstKind::Accumulator); 5733 break; 5734 case MachineCombinerPattern::FMLAv4i16_indexed_OP2: 5735 RC = &AArch64::FPR64RegClass; 5736 Opc = AArch64::FMLAv4i16_indexed; 5737 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5738 FMAInstKind::Indexed); 5739 break; 5740 case MachineCombinerPattern::FMLAv4f16_OP2: 5741 RC = &AArch64::FPR64RegClass; 5742 Opc = AArch64::FMLAv4f16; 5743 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5744 FMAInstKind::Accumulator); 5745 break; 5746 5747 case MachineCombinerPattern::FMLAv2i32_indexed_OP1: 5748 case 
MachineCombinerPattern::FMLAv2f32_OP1: 5749 RC = &AArch64::FPR64RegClass; 5750 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) { 5751 Opc = AArch64::FMLAv2i32_indexed; 5752 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5753 FMAInstKind::Indexed); 5754 } else { 5755 Opc = AArch64::FMLAv2f32; 5756 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5757 FMAInstKind::Accumulator); 5758 } 5759 break; 5760 case MachineCombinerPattern::FMLAv2i32_indexed_OP2: 5761 case MachineCombinerPattern::FMLAv2f32_OP2: 5762 RC = &AArch64::FPR64RegClass; 5763 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) { 5764 Opc = AArch64::FMLAv2i32_indexed; 5765 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5766 FMAInstKind::Indexed); 5767 } else { 5768 Opc = AArch64::FMLAv2f32; 5769 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5770 FMAInstKind::Accumulator); 5771 } 5772 break; 5773 5774 case MachineCombinerPattern::FMLAv8i16_indexed_OP1: 5775 RC = &AArch64::FPR128RegClass; 5776 Opc = AArch64::FMLAv8i16_indexed; 5777 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5778 FMAInstKind::Indexed); 5779 break; 5780 case MachineCombinerPattern::FMLAv8f16_OP1: 5781 RC = &AArch64::FPR128RegClass; 5782 Opc = AArch64::FMLAv8f16; 5783 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5784 FMAInstKind::Accumulator); 5785 break; 5786 case MachineCombinerPattern::FMLAv8i16_indexed_OP2: 5787 RC = &AArch64::FPR128RegClass; 5788 Opc = AArch64::FMLAv8i16_indexed; 5789 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5790 FMAInstKind::Indexed); 5791 break; 5792 case MachineCombinerPattern::FMLAv8f16_OP2: 5793 RC = &AArch64::FPR128RegClass; 5794 Opc = AArch64::FMLAv8f16; 5795 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5796 FMAInstKind::Accumulator); 5797 break; 5798 5799 case MachineCombinerPattern::FMLAv2i64_indexed_OP1: 5800 case MachineCombinerPattern::FMLAv2f64_OP1: 5801 RC = &AArch64::FPR128RegClass; 5802 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) { 5803 Opc = AArch64::FMLAv2i64_indexed; 5804 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5805 FMAInstKind::Indexed); 5806 } else { 5807 Opc = AArch64::FMLAv2f64; 5808 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5809 FMAInstKind::Accumulator); 5810 } 5811 break; 5812 case MachineCombinerPattern::FMLAv2i64_indexed_OP2: 5813 case MachineCombinerPattern::FMLAv2f64_OP2: 5814 RC = &AArch64::FPR128RegClass; 5815 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) { 5816 Opc = AArch64::FMLAv2i64_indexed; 5817 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5818 FMAInstKind::Indexed); 5819 } else { 5820 Opc = AArch64::FMLAv2f64; 5821 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5822 FMAInstKind::Accumulator); 5823 } 5824 break; 5825 5826 case MachineCombinerPattern::FMLAv4i32_indexed_OP1: 5827 case MachineCombinerPattern::FMLAv4f32_OP1: 5828 RC = &AArch64::FPR128RegClass; 5829 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) { 5830 Opc = AArch64::FMLAv4i32_indexed; 5831 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5832 FMAInstKind::Indexed); 5833 } else { 5834 Opc = AArch64::FMLAv4f32; 5835 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5836 FMAInstKind::Accumulator); 5837 } 5838 break; 5839 5840 case MachineCombinerPattern::FMLAv4i32_indexed_OP2: 5841 case 
MachineCombinerPattern::FMLAv4f32_OP2: 5842 RC = &AArch64::FPR128RegClass; 5843 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) { 5844 Opc = AArch64::FMLAv4i32_indexed; 5845 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5846 FMAInstKind::Indexed); 5847 } else { 5848 Opc = AArch64::FMLAv4f32; 5849 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5850 FMAInstKind::Accumulator); 5851 } 5852 break; 5853 5854 case MachineCombinerPattern::FMULSUBH_OP1: 5855 Opc = AArch64::FNMSUBHrrr; 5856 RC = &AArch64::FPR16RegClass; 5857 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5858 break; 5859 case MachineCombinerPattern::FMULSUBS_OP1: 5860 Opc = AArch64::FNMSUBSrrr; 5861 RC = &AArch64::FPR32RegClass; 5862 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5863 break; 5864 case MachineCombinerPattern::FMULSUBD_OP1: 5865 Opc = AArch64::FNMSUBDrrr; 5866 RC = &AArch64::FPR64RegClass; 5867 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5868 break; 5869 5870 case MachineCombinerPattern::FNMULSUBH_OP1: 5871 Opc = AArch64::FNMADDHrrr; 5872 RC = &AArch64::FPR16RegClass; 5873 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5874 break; 5875 case MachineCombinerPattern::FNMULSUBS_OP1: 5876 Opc = AArch64::FNMADDSrrr; 5877 RC = &AArch64::FPR32RegClass; 5878 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5879 break; 5880 case MachineCombinerPattern::FNMULSUBD_OP1: 5881 Opc = AArch64::FNMADDDrrr; 5882 RC = &AArch64::FPR64RegClass; 5883 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5884 break; 5885 5886 case MachineCombinerPattern::FMULSUBH_OP2: 5887 Opc = AArch64::FMSUBHrrr; 5888 RC = &AArch64::FPR16RegClass; 5889 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5890 break; 5891 case MachineCombinerPattern::FMULSUBS_OP2: 5892 Opc = AArch64::FMSUBSrrr; 5893 RC = &AArch64::FPR32RegClass; 5894 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5895 break; 5896 case MachineCombinerPattern::FMULSUBD_OP2: 5897 Opc = AArch64::FMSUBDrrr; 5898 RC = &AArch64::FPR64RegClass; 5899 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5900 break; 5901 5902 case MachineCombinerPattern::FMLSv1i32_indexed_OP2: 5903 Opc = AArch64::FMLSv1i32_indexed; 5904 RC = &AArch64::FPR32RegClass; 5905 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5906 FMAInstKind::Indexed); 5907 break; 5908 5909 case MachineCombinerPattern::FMLSv1i64_indexed_OP2: 5910 Opc = AArch64::FMLSv1i64_indexed; 5911 RC = &AArch64::FPR64RegClass; 5912 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5913 FMAInstKind::Indexed); 5914 break; 5915 5916 case MachineCombinerPattern::FMLSv4f16_OP1: 5917 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: { 5918 RC = &AArch64::FPR64RegClass; 5919 Register NewVR = MRI.createVirtualRegister(RC); 5920 MachineInstrBuilder MIB1 = 5921 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f16), NewVR) 5922 .add(Root.getOperand(2)); 5923 InsInstrs.push_back(MIB1); 5924 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5925 if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) { 5926 Opc = AArch64::FMLAv4f16; 5927 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5928 FMAInstKind::Accumulator, &NewVR); 5929 } else { 5930 Opc = AArch64::FMLAv4i16_indexed; 5931 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5932 FMAInstKind::Indexed, &NewVR); 5933 } 5934 
break; 5935 } 5936 case MachineCombinerPattern::FMLSv4f16_OP2: 5937 RC = &AArch64::FPR64RegClass; 5938 Opc = AArch64::FMLSv4f16; 5939 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5940 FMAInstKind::Accumulator); 5941 break; 5942 case MachineCombinerPattern::FMLSv4i16_indexed_OP2: 5943 RC = &AArch64::FPR64RegClass; 5944 Opc = AArch64::FMLSv4i16_indexed; 5945 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5946 FMAInstKind::Indexed); 5947 break; 5948 5949 case MachineCombinerPattern::FMLSv2f32_OP2: 5950 case MachineCombinerPattern::FMLSv2i32_indexed_OP2: 5951 RC = &AArch64::FPR64RegClass; 5952 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) { 5953 Opc = AArch64::FMLSv2i32_indexed; 5954 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5955 FMAInstKind::Indexed); 5956 } else { 5957 Opc = AArch64::FMLSv2f32; 5958 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5959 FMAInstKind::Accumulator); 5960 } 5961 break; 5962 5963 case MachineCombinerPattern::FMLSv8f16_OP1: 5964 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: { 5965 RC = &AArch64::FPR128RegClass; 5966 Register NewVR = MRI.createVirtualRegister(RC); 5967 MachineInstrBuilder MIB1 = 5968 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv8f16), NewVR) 5969 .add(Root.getOperand(2)); 5970 InsInstrs.push_back(MIB1); 5971 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5972 if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) { 5973 Opc = AArch64::FMLAv8f16; 5974 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5975 FMAInstKind::Accumulator, &NewVR); 5976 } else { 5977 Opc = AArch64::FMLAv8i16_indexed; 5978 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5979 FMAInstKind::Indexed, &NewVR); 5980 } 5981 break; 5982 } 5983 case MachineCombinerPattern::FMLSv8f16_OP2: 5984 RC = &AArch64::FPR128RegClass; 5985 Opc = AArch64::FMLSv8f16; 5986 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5987 FMAInstKind::Accumulator); 5988 break; 5989 case MachineCombinerPattern::FMLSv8i16_indexed_OP2: 5990 RC = &AArch64::FPR128RegClass; 5991 Opc = AArch64::FMLSv8i16_indexed; 5992 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5993 FMAInstKind::Indexed); 5994 break; 5995 5996 case MachineCombinerPattern::FMLSv2f64_OP2: 5997 case MachineCombinerPattern::FMLSv2i64_indexed_OP2: 5998 RC = &AArch64::FPR128RegClass; 5999 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) { 6000 Opc = AArch64::FMLSv2i64_indexed; 6001 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6002 FMAInstKind::Indexed); 6003 } else { 6004 Opc = AArch64::FMLSv2f64; 6005 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6006 FMAInstKind::Accumulator); 6007 } 6008 break; 6009 6010 case MachineCombinerPattern::FMLSv4f32_OP2: 6011 case MachineCombinerPattern::FMLSv4i32_indexed_OP2: 6012 RC = &AArch64::FPR128RegClass; 6013 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) { 6014 Opc = AArch64::FMLSv4i32_indexed; 6015 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6016 FMAInstKind::Indexed); 6017 } else { 6018 Opc = AArch64::FMLSv4f32; 6019 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 6020 FMAInstKind::Accumulator); 6021 } 6022 break; 6023 case MachineCombinerPattern::FMLSv2f32_OP1: 6024 case MachineCombinerPattern::FMLSv2i32_indexed_OP1: { 6025 RC = &AArch64::FPR64RegClass; 6026 Register NewVR = MRI.createVirtualRegister(RC); 6027 
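    // For these *_OP1 forms the multiply feeds the first FSUB operand, i.e.
    // Root is fsub(fmul(a, b), c). Negate the addend c into NewVR with FNEG
    // below, then emit an FMLA accumulating onto NewVR, so the result is
    // (-c) + a*b == a*b - c.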
MachineInstrBuilder MIB1 = 6028 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR) 6029 .add(Root.getOperand(2)); 6030 InsInstrs.push_back(MIB1); 6031 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6032 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) { 6033 Opc = AArch64::FMLAv2i32_indexed; 6034 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6035 FMAInstKind::Indexed, &NewVR); 6036 } else { 6037 Opc = AArch64::FMLAv2f32; 6038 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6039 FMAInstKind::Accumulator, &NewVR); 6040 } 6041 break; 6042 } 6043 case MachineCombinerPattern::FMLSv4f32_OP1: 6044 case MachineCombinerPattern::FMLSv4i32_indexed_OP1: { 6045 RC = &AArch64::FPR128RegClass; 6046 Register NewVR = MRI.createVirtualRegister(RC); 6047 MachineInstrBuilder MIB1 = 6048 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR) 6049 .add(Root.getOperand(2)); 6050 InsInstrs.push_back(MIB1); 6051 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6052 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) { 6053 Opc = AArch64::FMLAv4i32_indexed; 6054 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6055 FMAInstKind::Indexed, &NewVR); 6056 } else { 6057 Opc = AArch64::FMLAv4f32; 6058 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6059 FMAInstKind::Accumulator, &NewVR); 6060 } 6061 break; 6062 } 6063 case MachineCombinerPattern::FMLSv2f64_OP1: 6064 case MachineCombinerPattern::FMLSv2i64_indexed_OP1: { 6065 RC = &AArch64::FPR128RegClass; 6066 Register NewVR = MRI.createVirtualRegister(RC); 6067 MachineInstrBuilder MIB1 = 6068 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR) 6069 .add(Root.getOperand(2)); 6070 InsInstrs.push_back(MIB1); 6071 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6072 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) { 6073 Opc = AArch64::FMLAv2i64_indexed; 6074 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6075 FMAInstKind::Indexed, &NewVR); 6076 } else { 6077 Opc = AArch64::FMLAv2f64; 6078 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6079 FMAInstKind::Accumulator, &NewVR); 6080 } 6081 break; 6082 } 6083 } // end switch (Pattern) 6084 // Record MUL and ADD/SUB for deletion 6085 // FIXME: This assertion fails in CodeGen/AArch64/tailmerging_in_mbp.ll and 6086 // CodeGen/AArch64/urem-seteq-nonzero.ll. 6087 // assert(MUL && "MUL was never set"); 6088 DelInstrs.push_back(MUL); 6089 DelInstrs.push_back(&Root); 6090 } 6091 6092 /// Replace csincr-branch sequence by simple conditional branch 6093 /// 6094 /// Examples: 6095 /// 1. \code 6096 /// csinc w9, wzr, wzr, <condition code> 6097 /// tbnz w9, #0, 0x44 6098 /// \endcode 6099 /// to 6100 /// \code 6101 /// b.<inverted condition code> 6102 /// \endcode 6103 /// 6104 /// 2. \code 6105 /// csinc w9, wzr, wzr, <condition code> 6106 /// tbz w9, #0, 0x44 6107 /// \endcode 6108 /// to 6109 /// \code 6110 /// b.<condition code> 6111 /// \endcode 6112 /// 6113 /// Replace compare and branch sequence by TBZ/TBNZ instruction when the 6114 /// compare's constant operand is power of 2. 
6115 /// 6116 /// Examples: 6117 /// \code 6118 /// and w8, w8, #0x400 6119 /// cbnz w8, L1 6120 /// \endcode 6121 /// to 6122 /// \code 6123 /// tbnz w8, #10, L1 6124 /// \endcode 6125 /// 6126 /// \param MI Conditional Branch 6127 /// \return True when the simple conditional branch is generated 6128 /// 6129 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const { 6130 bool IsNegativeBranch = false; 6131 bool IsTestAndBranch = false; 6132 unsigned TargetBBInMI = 0; 6133 switch (MI.getOpcode()) { 6134 default: 6135 llvm_unreachable("Unknown branch instruction?"); 6136 case AArch64::Bcc: 6137 return false; 6138 case AArch64::CBZW: 6139 case AArch64::CBZX: 6140 TargetBBInMI = 1; 6141 break; 6142 case AArch64::CBNZW: 6143 case AArch64::CBNZX: 6144 TargetBBInMI = 1; 6145 IsNegativeBranch = true; 6146 break; 6147 case AArch64::TBZW: 6148 case AArch64::TBZX: 6149 TargetBBInMI = 2; 6150 IsTestAndBranch = true; 6151 break; 6152 case AArch64::TBNZW: 6153 case AArch64::TBNZX: 6154 TargetBBInMI = 2; 6155 IsNegativeBranch = true; 6156 IsTestAndBranch = true; 6157 break; 6158 } 6159 // So we increment a zero register and test for bits other 6160 // than bit 0? Conservatively bail out in case the verifier 6161 // missed this case. 6162 if (IsTestAndBranch && MI.getOperand(1).getImm()) 6163 return false; 6164 6165 // Find Definition. 6166 assert(MI.getParent() && "Incomplete machine instruciton\n"); 6167 MachineBasicBlock *MBB = MI.getParent(); 6168 MachineFunction *MF = MBB->getParent(); 6169 MachineRegisterInfo *MRI = &MF->getRegInfo(); 6170 Register VReg = MI.getOperand(0).getReg(); 6171 if (!Register::isVirtualRegister(VReg)) 6172 return false; 6173 6174 MachineInstr *DefMI = MRI->getVRegDef(VReg); 6175 6176 // Look through COPY instructions to find definition. 6177 while (DefMI->isCopy()) { 6178 Register CopyVReg = DefMI->getOperand(1).getReg(); 6179 if (!MRI->hasOneNonDBGUse(CopyVReg)) 6180 return false; 6181 if (!MRI->hasOneDef(CopyVReg)) 6182 return false; 6183 DefMI = MRI->getVRegDef(CopyVReg); 6184 } 6185 6186 switch (DefMI->getOpcode()) { 6187 default: 6188 return false; 6189 // Fold AND into a TBZ/TBNZ if constant operand is power of 2. 6190 case AArch64::ANDWri: 6191 case AArch64::ANDXri: { 6192 if (IsTestAndBranch) 6193 return false; 6194 if (DefMI->getParent() != MBB) 6195 return false; 6196 if (!MRI->hasOneNonDBGUse(VReg)) 6197 return false; 6198 6199 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri); 6200 uint64_t Mask = AArch64_AM::decodeLogicalImmediate( 6201 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64); 6202 if (!isPowerOf2_64(Mask)) 6203 return false; 6204 6205 MachineOperand &MO = DefMI->getOperand(1); 6206 Register NewReg = MO.getReg(); 6207 if (!Register::isVirtualRegister(NewReg)) 6208 return false; 6209 6210 assert(!MRI->def_empty(NewReg) && "Register must be defined."); 6211 6212 MachineBasicBlock &RefToMBB = *MBB; 6213 MachineBasicBlock *TBB = MI.getOperand(1).getMBB(); 6214 DebugLoc DL = MI.getDebugLoc(); 6215 unsigned Imm = Log2_64(Mask); 6216 unsigned Opc = (Imm < 32) 6217 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW) 6218 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX); 6219 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc)) 6220 .addReg(NewReg) 6221 .addImm(Imm) 6222 .addMBB(TBB); 6223 // Register lives on to the CBZ now. 6224 MO.setIsKill(false); 6225 6226 // For immediate smaller than 32, we need to use the 32-bit 6227 // variant (W) in all cases. Indeed the 64-bit variant does not 6228 // allow to encode them. 
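    // (In the A64 encoding, TB(N)ZX only covers bit positions 32-63; bit
    // positions 0-31 are only encodable with the W form.)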
6229 // Therefore, if the input register is 64-bit, we need to take the 6230 // 32-bit sub-part. 6231 if (!Is32Bit && Imm < 32) 6232 NewMI->getOperand(0).setSubReg(AArch64::sub_32); 6233 MI.eraseFromParent(); 6234 return true; 6235 } 6236 // Look for CSINC 6237 case AArch64::CSINCWr: 6238 case AArch64::CSINCXr: { 6239 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR && 6240 DefMI->getOperand(2).getReg() == AArch64::WZR) && 6241 !(DefMI->getOperand(1).getReg() == AArch64::XZR && 6242 DefMI->getOperand(2).getReg() == AArch64::XZR)) 6243 return false; 6244 6245 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) 6246 return false; 6247 6248 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm(); 6249 // Convert only when the condition code is not modified between 6250 // the CSINC and the branch. The CC may be used by other 6251 // instructions in between. 6252 if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write)) 6253 return false; 6254 MachineBasicBlock &RefToMBB = *MBB; 6255 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB(); 6256 DebugLoc DL = MI.getDebugLoc(); 6257 if (IsNegativeBranch) 6258 CC = AArch64CC::getInvertedCondCode(CC); 6259 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB); 6260 MI.eraseFromParent(); 6261 return true; 6262 } 6263 } 6264 } 6265 6266 std::pair<unsigned, unsigned> 6267 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { 6268 const unsigned Mask = AArch64II::MO_FRAGMENT; 6269 return std::make_pair(TF & Mask, TF & ~Mask); 6270 } 6271 6272 ArrayRef<std::pair<unsigned, const char *>> 6273 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { 6274 using namespace AArch64II; 6275 6276 static const std::pair<unsigned, const char *> TargetFlags[] = { 6277 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"}, 6278 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"}, 6279 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"}, 6280 {MO_HI12, "aarch64-hi12"}}; 6281 return makeArrayRef(TargetFlags); 6282 } 6283 6284 ArrayRef<std::pair<unsigned, const char *>> 6285 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { 6286 using namespace AArch64II; 6287 6288 static const std::pair<unsigned, const char *> TargetFlags[] = { 6289 {MO_COFFSTUB, "aarch64-coffstub"}, 6290 {MO_GOT, "aarch64-got"}, 6291 {MO_NC, "aarch64-nc"}, 6292 {MO_S, "aarch64-s"}, 6293 {MO_TLS, "aarch64-tls"}, 6294 {MO_DLLIMPORT, "aarch64-dllimport"}, 6295 {MO_PREL, "aarch64-prel"}, 6296 {MO_TAGGED, "aarch64-tagged"}}; 6297 return makeArrayRef(TargetFlags); 6298 } 6299 6300 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> 6301 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const { 6302 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] = 6303 {{MOSuppressPair, "aarch64-suppress-pair"}, 6304 {MOStridedAccess, "aarch64-strided-access"}}; 6305 return makeArrayRef(TargetFlags); 6306 } 6307 6308 /// Constants defining how certain sequences should be outlined. 6309 /// This encompasses how an outlined function should be called, and what kind of 6310 /// frame should be emitted for that outlined function. 6311 /// 6312 /// \p MachineOutlinerDefault implies that the function should be called with 6313 /// a save and restore of LR to the stack. 
6314 /// 6315 /// That is, 6316 /// 6317 /// I1 Save LR OUTLINED_FUNCTION: 6318 /// I2 --> BL OUTLINED_FUNCTION I1 6319 /// I3 Restore LR I2 6320 /// I3 6321 /// RET 6322 /// 6323 /// * Call construction overhead: 3 (save + BL + restore) 6324 /// * Frame construction overhead: 1 (ret) 6325 /// * Requires stack fixups? Yes 6326 /// 6327 /// \p MachineOutlinerTailCall implies that the function is being created from 6328 /// a sequence of instructions ending in a return. 6329 /// 6330 /// That is, 6331 /// 6332 /// I1 OUTLINED_FUNCTION: 6333 /// I2 --> B OUTLINED_FUNCTION I1 6334 /// RET I2 6335 /// RET 6336 /// 6337 /// * Call construction overhead: 1 (B) 6338 /// * Frame construction overhead: 0 (Return included in sequence) 6339 /// * Requires stack fixups? No 6340 /// 6341 /// \p MachineOutlinerNoLRSave implies that the function should be called using 6342 /// a BL instruction, but doesn't require LR to be saved and restored. This 6343 /// happens when LR is known to be dead. 6344 /// 6345 /// That is, 6346 /// 6347 /// I1 OUTLINED_FUNCTION: 6348 /// I2 --> BL OUTLINED_FUNCTION I1 6349 /// I3 I2 6350 /// I3 6351 /// RET 6352 /// 6353 /// * Call construction overhead: 1 (BL) 6354 /// * Frame construction overhead: 1 (RET) 6355 /// * Requires stack fixups? No 6356 /// 6357 /// \p MachineOutlinerThunk implies that the function is being created from 6358 /// a sequence of instructions ending in a call. The outlined function is 6359 /// called with a BL instruction, and the outlined function tail-calls the 6360 /// original call destination. 6361 /// 6362 /// That is, 6363 /// 6364 /// I1 OUTLINED_FUNCTION: 6365 /// I2 --> BL OUTLINED_FUNCTION I1 6366 /// BL f I2 6367 /// B f 6368 /// * Call construction overhead: 1 (BL) 6369 /// * Frame construction overhead: 0 6370 /// * Requires stack fixups? No 6371 /// 6372 /// \p MachineOutlinerRegSave implies that the function should be called with a 6373 /// save and restore of LR to an available register. This allows us to avoid 6374 /// stack fixups. Note that this outlining variant is compatible with the 6375 /// NoLRSave case. 6376 /// 6377 /// That is, 6378 /// 6379 /// I1 Save LR OUTLINED_FUNCTION: 6380 /// I2 --> BL OUTLINED_FUNCTION I1 6381 /// I3 Restore LR I2 6382 /// I3 6383 /// RET 6384 /// 6385 /// * Call construction overhead: 3 (save + BL + restore) 6386 /// * Frame construction overhead: 1 (ret) 6387 /// * Requires stack fixups? No 6388 enum MachineOutlinerClass { 6389 MachineOutlinerDefault, /// Emit a save, restore, call, and return. 6390 MachineOutlinerTailCall, /// Only emit a branch. 6391 MachineOutlinerNoLRSave, /// Emit a call and return. 6392 MachineOutlinerThunk, /// Emit a call and tail-call. 6393 MachineOutlinerRegSave /// Same as default, but save to a register. 6394 }; 6395 6396 enum MachineOutlinerMBBFlags { 6397 LRUnavailableSomewhere = 0x2, 6398 HasCalls = 0x4, 6399 UnsafeRegsDead = 0x8 6400 }; 6401 6402 unsigned 6403 AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const { 6404 assert(C.LRUWasSet && "LRU wasn't set?"); 6405 MachineFunction *MF = C.getMF(); 6406 const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>( 6407 MF->getSubtarget().getRegisterInfo()); 6408 6409 // Check if there is an available register across the sequence that we can 6410 // use. 6411 for (unsigned Reg : AArch64::GPR64RegClass) { 6412 if (!ARI->isReservedReg(*MF, Reg) && 6413 Reg != AArch64::LR && // LR is not reserved, but don't use it. 
        Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
        Reg != AArch64::X17 && // Ditto for X17.
        C.LRU.available(Reg) && C.UsedInSequence.available(Reg))
      return Reg;
  }

  // No suitable register. Return 0.
  return 0u;
}

static bool
outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
                                         const outliner::Candidate &b) {
  const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
  const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();

  return MFIa->shouldSignReturnAddress(false) == MFIb->shouldSignReturnAddress(false) &&
         MFIa->shouldSignReturnAddress(true) == MFIb->shouldSignReturnAddress(true);
}

static bool
outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
                                       const outliner::Candidate &b) {
  const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
  const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();

  return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
}

static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
                                                const outliner::Candidate &b) {
  const AArch64Subtarget &SubtargetA =
      a.getMF()->getSubtarget<AArch64Subtarget>();
  const AArch64Subtarget &SubtargetB =
      b.getMF()->getSubtarget<AArch64Subtarget>();
  return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
}

outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
    std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
  outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
  unsigned SequenceSize =
      std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0,
                      [this](unsigned Sum, const MachineInstr &MI) {
                        return Sum + getInstSizeInBytes(MI);
                      });
  unsigned NumBytesToCreateFrame = 0;

  // We only allow outlining for functions having exactly matching return
  // address signing attributes, i.e., all share the same value for the
  // attribute "sign-return-address" and all share the same type of key they
  // are signed with.
  // Additionally we require all functions to simultaneously either support
  // v8.3a features or not. Otherwise an outlined function could get signed
  // using dedicated v8.3 instructions and a call from a function that doesn't
  // support v8.3 instructions would therefore be invalid.
  if (std::adjacent_find(
          RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
          [](const outliner::Candidate &a, const outliner::Candidate &b) {
            // Return true if a and b are non-equal w.r.t. return address
            // signing or support of v8.3a features
            if (outliningCandidatesSigningScopeConsensus(a, b) &&
                outliningCandidatesSigningKeyConsensus(a, b) &&
                outliningCandidatesV8_3OpsConsensus(a, b)) {
              return false;
            }
            return true;
          }) != RepeatedSequenceLocs.end()) {
    return outliner::OutlinedFunction();
  }

  // Since at this point all candidates agree on their return address signing,
  // picking just one is fine. If the candidate functions potentially sign their
  // return addresses, the outlined function should do the same. Note that in
  // the case of "sign-return-address"="non-leaf" this is an assumption: It is
  // not certainly true that the outlined function will have to sign its return
  // address but this decision is made later, when the decision to outline
  // has already been made.
  // The same holds for the number of additional instructions we need: On
  // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
  // necessary. However, at this point we don't know if the outlined function
  // will have a RET instruction so we assume the worst.
  const TargetRegisterInfo &TRI = getRegisterInfo();
  if (FirstCand.getMF()
          ->getInfo<AArch64FunctionInfo>()
          ->shouldSignReturnAddress(true)) {
    // One PAC and one AUT instruction
    NumBytesToCreateFrame += 8;

    // We have to check if sp modifying instructions would get outlined.
    // If so we only allow outlining if sp is unchanged overall, so matching
    // sub and add instructions are okay to outline; all other sp modifications
    // are not.
    auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
      int SPValue = 0;
      MachineBasicBlock::iterator MBBI = C.front();
      for (;;) {
        if (MBBI->modifiesRegister(AArch64::SP, &TRI)) {
          switch (MBBI->getOpcode()) {
          case AArch64::ADDXri:
          case AArch64::ADDWri:
            assert(MBBI->getNumOperands() == 4 && "Wrong number of operands");
            assert(MBBI->getOperand(2).isImm() &&
                   "Expected operand to be immediate");
            assert(MBBI->getOperand(1).isReg() &&
                   "Expected operand to be a register");
            // Check if the add just increments sp. If so, we search for
            // matching sub instructions that decrement sp. If not, the
            // modification is illegal
            if (MBBI->getOperand(1).getReg() == AArch64::SP)
              SPValue += MBBI->getOperand(2).getImm();
            else
              return true;
            break;
          case AArch64::SUBXri:
          case AArch64::SUBWri:
            assert(MBBI->getNumOperands() == 4 && "Wrong number of operands");
            assert(MBBI->getOperand(2).isImm() &&
                   "Expected operand to be immediate");
            assert(MBBI->getOperand(1).isReg() &&
                   "Expected operand to be a register");
            // Check if the sub just decrements sp. If so, we search for
            // matching add instructions that increment sp. If not, the
            // modification is illegal
            if (MBBI->getOperand(1).getReg() == AArch64::SP)
              SPValue -= MBBI->getOperand(2).getImm();
            else
              return true;
            break;
          default:
            return true;
          }
        }
        if (MBBI == C.back())
          break;
        ++MBBI;
      }
      if (SPValue)
        return true;
      return false;
    };
    // Remove candidates with illegal stack modifying instructions
    llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);

    // If the sequence doesn't have enough candidates left, then we're done.
    if (RepeatedSequenceLocs.size() < 2)
      return outliner::OutlinedFunction();
  }

  // Properties about candidate MBBs that hold for all of them.
  unsigned FlagsSetInAll = 0xF;

  // Compute liveness information for each candidate, and set FlagsSetInAll.
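  // FlagsSetInAll keeps only the MachineOutlinerMBBFlags bits that are set in
  // every candidate: e.g. UnsafeRegsDead survives the loop below only if x16,
  // x17 and NZCV are known dead in all candidate blocks, which lets us skip
  // the per-candidate liveness check further down.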
6567 std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
6568 [&FlagsSetInAll](outliner::Candidate &C) {
6569 FlagsSetInAll &= C.Flags;
6570 });
6571
6572 // According to the AArch64 Procedure Call Standard, the following are
6573 // undefined on entry/exit from a function call:
6574 //
6575 // * Registers x16, x17, (and thus w16, w17)
6576 // * Condition codes (and thus the NZCV register)
6577 //
6578 // Because of this, we can't outline any sequence of instructions where
6579 // one of these registers is live into/across it. Thus, we need to delete
6580 // those candidates.
6581
6582
6583 auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) {
6584 // If the unsafe registers in this block are all dead, then we don't need
6585 // to compute liveness here.
6586 if (C.Flags & UnsafeRegsDead)
6587 return false;
6588 C.initLRU(TRI);
6589 LiveRegUnits LRU = C.LRU;
6590 return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) ||
6591 !LRU.available(AArch64::NZCV));
6592 };
6593
6594 // Are there any candidates where those registers are live?
6595 if (!(FlagsSetInAll & UnsafeRegsDead)) {
6596 // Erase every candidate that violates the restrictions above. (It could be
6597 // true that we have viable candidates, so it's not worth bailing out in
6598 // the case that, say, 1 out of 20 candidates violate the restrictions.)
6599 llvm::erase_if(RepeatedSequenceLocs, CantGuaranteeValueAcrossCall);
6600
6601 // If the sequence doesn't have enough candidates left, then we're done.
6602 if (RepeatedSequenceLocs.size() < 2)
6603 return outliner::OutlinedFunction();
6604 }
6605
6606 // At this point, we have only "safe" candidates to outline. Figure out
6607 // frame + call instruction information.
6608
6609 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode();
6610
6611 // Helper lambda which sets call information for every candidate.
6612 auto SetCandidateCallInfo =
6613 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
6614 for (outliner::Candidate &C : RepeatedSequenceLocs)
6615 C.setCallInfo(CallID, NumBytesForCall);
6616 };
6617
6618 unsigned FrameID = MachineOutlinerDefault;
6619 NumBytesToCreateFrame += 4;
6620
6621 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
6622 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
6623 });
6624
6625 // We check to see if CFI Instructions are present, and if they are
6626 // we find the number of CFI Instructions in the candidates.
6627 unsigned CFICount = 0;
6628 MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front();
6629 for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx();
6630 Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) {
6631 const std::vector<MCCFIInstruction> &CFIInstructions =
6632 RepeatedSequenceLocs[0].getMF()->getFrameInstructions();
6633 if (MBBI->isCFIInstruction()) {
6634 unsigned CFIIndex = MBBI->getOperand(0).getCFIIndex();
6635 MCCFIInstruction CFI = CFIInstructions[CFIIndex];
6636 CFICount++;
6637 }
6638 MBBI++;
6639 }
6640
6641 // We compare the number of found CFI Instructions to the number of CFI
6642 // instructions in the parent function for each candidate. We must check this
6643 // since if we outline one of the CFI instructions in a function, we have to
6644 // outline them all for correctness. If we do not, the address offsets will be
6645 // incorrect between the two sections of the program.
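// For example, if the sequence covers only one of a function's three CFI
// instructions, the remaining unwind info would describe a frame that no
// longer matches the emitted code. The loop below therefore gives up unless
// the sequence contains every CFI instruction of the parent function.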
6646 for (outliner::Candidate &C : RepeatedSequenceLocs) {
6647 std::vector<MCCFIInstruction> CFIInstructions =
6648 C.getMF()->getFrameInstructions();
6649
6650 if (CFICount > 0 && CFICount != CFIInstructions.size())
6651 return outliner::OutlinedFunction();
6652 }
6653
6654 // Returns true if an instruction is safe to fix up, false otherwise.
6655 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
6656 if (MI.isCall())
6657 return true;
6658
6659 if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
6660 !MI.readsRegister(AArch64::SP, &TRI))
6661 return true;
6662
6663 // Any modification of SP will break our code to save/restore LR.
6664 // FIXME: We could handle some instructions which add a constant
6665 // offset to SP, with a bit more work.
6666 if (MI.modifiesRegister(AArch64::SP, &TRI))
6667 return false;
6668
6669 // At this point, we have a stack instruction that we might need to
6670 // fix up. We'll handle it if it's a load or store.
6671 if (MI.mayLoadOrStore()) {
6672 const MachineOperand *Base; // Filled with the base operand of MI.
6673 int64_t Offset; // Filled with the offset of MI.
6674 bool OffsetIsScalable;
6675
6676 // Does it allow us to offset the base operand and is the base the
6677 // register SP?
6678 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
6679 !Base->isReg() || Base->getReg() != AArch64::SP)
6680 return false;
6681
6682 // Fix-up code below assumes bytes.
6683 if (OffsetIsScalable)
6684 return false;
6685
6686 // Find the minimum/maximum offset for this instruction and check
6687 // if fixing it up would be in range.
6688 int64_t MinOffset,
6689 MaxOffset; // Unscaled offsets for the instruction.
6690 TypeSize Scale(0U, false); // The scale to multiply the offsets by.
6691 unsigned DummyWidth;
6692 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
6693
6694 Offset += 16; // Update the offset to what it would be if we outlined.
6695 if (Offset < MinOffset * (int64_t)Scale.getFixedSize() ||
6696 Offset > MaxOffset * (int64_t)Scale.getFixedSize())
6697 return false;
6698
6699 // It's in range, so we can outline it.
6700 return true;
6701 }
6702
6703 // FIXME: Add handling for instructions like "add x0, sp, #8".
6704
6705 // We can't fix it up, so don't outline it.
6706 return false;
6707 };
6708
6709 // True if it's possible to fix up each stack instruction in this sequence.
6710 // Important for frames/call variants that modify the stack.
6711 bool AllStackInstrsSafe = std::all_of(
6712 FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup);
6713
6714 // If the last instruction in any candidate is a terminator, then we should
6715 // tail call all of the candidates.
6716 if (RepeatedSequenceLocs[0].back()->isTerminator()) {
6717 FrameID = MachineOutlinerTailCall;
6718 NumBytesToCreateFrame = 0;
6719 SetCandidateCallInfo(MachineOutlinerTailCall, 4);
6720 }
6721
6722 else if (LastInstrOpcode == AArch64::BL ||
6723 ((LastInstrOpcode == AArch64::BLR ||
6724 LastInstrOpcode == AArch64::BLRNoIP) &&
6725 !HasBTI)) {
6726 // FIXME: Do we need to check if the code after this uses the value of LR?
6727 FrameID = MachineOutlinerThunk;
6728 NumBytesToCreateFrame = 0;
6729 SetCandidateCallInfo(MachineOutlinerThunk, 4);
6730 }
6731
6732 else {
6733 // We need to decide how to emit calls + frames. We can always emit the same
6734 // frame if we don't need to save to the stack. If we have to save to the
6735 // stack, then we need a different frame.
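// Roughly, the cases handled below are: if LR is free across the candidate we
// can just BL (MachineOutlinerNoLRSave, 4 bytes of call overhead); if another
// GPR is free we copy LR into it around the call (MachineOutlinerRegSave, 12
// bytes); if the sequence never touches SP we can spill LR to the stack
// (MachineOutlinerDefault, 12 bytes); otherwise the candidate is costed as if
// it were not outlined at all.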
6736 unsigned NumBytesNoStackCalls = 0; 6737 std::vector<outliner::Candidate> CandidatesWithoutStackFixups; 6738 6739 // Check if we have to save LR. 6740 for (outliner::Candidate &C : RepeatedSequenceLocs) { 6741 C.initLRU(TRI); 6742 6743 // If we have a noreturn caller, then we're going to be conservative and 6744 // say that we have to save LR. If we don't have a ret at the end of the 6745 // block, then we can't reason about liveness accurately. 6746 // 6747 // FIXME: We can probably do better than always disabling this in 6748 // noreturn functions by fixing up the liveness info. 6749 bool IsNoReturn = 6750 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn); 6751 6752 // Is LR available? If so, we don't need a save. 6753 if (C.LRU.available(AArch64::LR) && !IsNoReturn) { 6754 NumBytesNoStackCalls += 4; 6755 C.setCallInfo(MachineOutlinerNoLRSave, 4); 6756 CandidatesWithoutStackFixups.push_back(C); 6757 } 6758 6759 // Is an unused register available? If so, we won't modify the stack, so 6760 // we can outline with the same frame type as those that don't save LR. 6761 else if (findRegisterToSaveLRTo(C)) { 6762 NumBytesNoStackCalls += 12; 6763 C.setCallInfo(MachineOutlinerRegSave, 12); 6764 CandidatesWithoutStackFixups.push_back(C); 6765 } 6766 6767 // Is SP used in the sequence at all? If not, we don't have to modify 6768 // the stack, so we are guaranteed to get the same frame. 6769 else if (C.UsedInSequence.available(AArch64::SP)) { 6770 NumBytesNoStackCalls += 12; 6771 C.setCallInfo(MachineOutlinerDefault, 12); 6772 CandidatesWithoutStackFixups.push_back(C); 6773 } 6774 6775 // If we outline this, we need to modify the stack. Pretend we don't 6776 // outline this by saving all of its bytes. 6777 else { 6778 NumBytesNoStackCalls += SequenceSize; 6779 } 6780 } 6781 6782 // If there are no places where we have to save LR, then note that we 6783 // don't have to update the stack. Otherwise, give every candidate the 6784 // default call type, as long as it's safe to do so. 6785 if (!AllStackInstrsSafe || 6786 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) { 6787 RepeatedSequenceLocs = CandidatesWithoutStackFixups; 6788 FrameID = MachineOutlinerNoLRSave; 6789 } else { 6790 SetCandidateCallInfo(MachineOutlinerDefault, 12); 6791 6792 // Bugzilla ID: 46767 6793 // TODO: Check if fixing up the stack more than once is safe so we can 6794 // outline these. 6795 // 6796 // An outline resulting in a caller that requires stack fixups at the 6797 // callsite to a callee that also requires stack fixups can happen when 6798 // there are no available registers at the candidate callsite for a 6799 // candidate that itself also has calls. 6800 // 6801 // In other words if function_containing_sequence in the following pseudo 6802 // assembly requires that we save LR at the point of the call, but there 6803 // are no available registers: in this case we save using SP and as a 6804 // result the SP offsets requires stack fixups by multiples of 16. 6805 // 6806 // function_containing_sequence: 6807 // ... 6808 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N 6809 // call OUTLINED_FUNCTION_N 6810 // restore LR from SP 6811 // ... 6812 // 6813 // OUTLINED_FUNCTION_N: 6814 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N 6815 // ... 
6816 // bl foo 6817 // restore LR from SP 6818 // ret 6819 // 6820 // Because the code to handle more than one stack fixup does not 6821 // currently have the proper checks for legality, these cases will assert 6822 // in the AArch64 MachineOutliner. This is because the code to do this 6823 // needs more hardening, testing, better checks that generated code is 6824 // legal, etc and because it is only verified to handle a single pass of 6825 // stack fixup. 6826 // 6827 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch 6828 // these cases until they are known to be handled. Bugzilla 46767 is 6829 // referenced in comments at the assert site. 6830 // 6831 // To avoid asserting (or generating non-legal code on noassert builds) 6832 // we remove all candidates which would need more than one stack fixup by 6833 // pruning the cases where the candidate has calls while also having no 6834 // available LR and having no available general purpose registers to copy 6835 // LR to (ie one extra stack save/restore). 6836 // 6837 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { 6838 erase_if(RepeatedSequenceLocs, [this](outliner::Candidate &C) { 6839 return (std::any_of( 6840 C.front(), std::next(C.back()), 6841 [](const MachineInstr &MI) { return MI.isCall(); })) && 6842 (!C.LRU.available(AArch64::LR) || !findRegisterToSaveLRTo(C)); 6843 }); 6844 } 6845 } 6846 6847 // If we dropped all of the candidates, bail out here. 6848 if (RepeatedSequenceLocs.size() < 2) { 6849 RepeatedSequenceLocs.clear(); 6850 return outliner::OutlinedFunction(); 6851 } 6852 } 6853 6854 // Does every candidate's MBB contain a call? If so, then we might have a call 6855 // in the range. 6856 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { 6857 // Check if the range contains a call. These require a save + restore of the 6858 // link register. 6859 bool ModStackToSaveLR = false; 6860 if (std::any_of(FirstCand.front(), FirstCand.back(), 6861 [](const MachineInstr &MI) { return MI.isCall(); })) 6862 ModStackToSaveLR = true; 6863 6864 // Handle the last instruction separately. If this is a tail call, then the 6865 // last instruction is a call. We don't want to save + restore in this case. 6866 // However, it could be possible that the last instruction is a call without 6867 // it being valid to tail call this sequence. We should consider this as 6868 // well. 6869 else if (FrameID != MachineOutlinerThunk && 6870 FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall()) 6871 ModStackToSaveLR = true; 6872 6873 if (ModStackToSaveLR) { 6874 // We can't fix up the stack. Bail out. 6875 if (!AllStackInstrsSafe) { 6876 RepeatedSequenceLocs.clear(); 6877 return outliner::OutlinedFunction(); 6878 } 6879 6880 // Save + restore LR. 6881 NumBytesToCreateFrame += 8; 6882 } 6883 } 6884 6885 // If we have CFI instructions, we can only outline if the outlined section 6886 // can be a tail call 6887 if (FrameID != MachineOutlinerTailCall && CFICount > 0) 6888 return outliner::OutlinedFunction(); 6889 6890 return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, 6891 NumBytesToCreateFrame, FrameID); 6892 } 6893 6894 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom( 6895 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const { 6896 const Function &F = MF.getFunction(); 6897 6898 // Can F be deduplicated by the linker? If it can, don't outline from it. 
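// (If the linker ends up keeping an identical copy of F from another
// translation unit instead of this one, code outlined here is not guaranteed
// to be shared by that copy, so the expected size savings may not materialize;
// passing OutlineFromLinkOnceODRs opts back in.)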
6899 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage()) 6900 return false; 6901 6902 // Don't outline from functions with section markings; the program could 6903 // expect that all the code is in the named section. 6904 // FIXME: Allow outlining from multiple functions with the same section 6905 // marking. 6906 if (F.hasSection()) 6907 return false; 6908 6909 // Outlining from functions with redzones is unsafe since the outliner may 6910 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't 6911 // outline from it. 6912 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); 6913 if (!AFI || AFI->hasRedZone().getValueOr(true)) 6914 return false; 6915 6916 // FIXME: Teach the outliner to generate/handle Windows unwind info. 6917 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) 6918 return false; 6919 6920 // It's safe to outline from MF. 6921 return true; 6922 } 6923 6924 bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, 6925 unsigned &Flags) const { 6926 // Check if LR is available through all of the MBB. If it's not, then set 6927 // a flag. 6928 assert(MBB.getParent()->getRegInfo().tracksLiveness() && 6929 "Suitable Machine Function for outlining must track liveness"); 6930 LiveRegUnits LRU(getRegisterInfo()); 6931 6932 std::for_each(MBB.rbegin(), MBB.rend(), 6933 [&LRU](MachineInstr &MI) { LRU.accumulate(MI); }); 6934 6935 // Check if each of the unsafe registers are available... 6936 bool W16AvailableInBlock = LRU.available(AArch64::W16); 6937 bool W17AvailableInBlock = LRU.available(AArch64::W17); 6938 bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV); 6939 6940 // If all of these are dead (and not live out), we know we don't have to check 6941 // them later. 6942 if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock) 6943 Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead; 6944 6945 // Now, add the live outs to the set. 6946 LRU.addLiveOuts(MBB); 6947 6948 // If any of these registers is available in the MBB, but also a live out of 6949 // the block, then we know outlining is unsafe. 6950 if (W16AvailableInBlock && !LRU.available(AArch64::W16)) 6951 return false; 6952 if (W17AvailableInBlock && !LRU.available(AArch64::W17)) 6953 return false; 6954 if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV)) 6955 return false; 6956 6957 // Check if there's a call inside this MachineBasicBlock. If there is, then 6958 // set a flag. 6959 if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); })) 6960 Flags |= MachineOutlinerMBBFlags::HasCalls; 6961 6962 MachineFunction *MF = MBB.getParent(); 6963 6964 // In the event that we outline, we may have to save LR. If there is an 6965 // available register in the MBB, then we'll always save LR there. Check if 6966 // this is true. 6967 bool CanSaveLR = false; 6968 const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>( 6969 MF->getSubtarget().getRegisterInfo()); 6970 6971 // Check if there is an available register across the sequence that we can 6972 // use. 6973 for (unsigned Reg : AArch64::GPR64RegClass) { 6974 if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR && 6975 Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) { 6976 CanSaveLR = true; 6977 break; 6978 } 6979 } 6980 6981 // Check if we have a register we can save LR to, and if LR was used 6982 // somewhere. If both of those things are true, then we need to evaluate the 6983 // safety of outlining stack instructions later. 
6984 if (!CanSaveLR && !LRU.available(AArch64::LR)) 6985 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere; 6986 6987 return true; 6988 } 6989 6990 outliner::InstrType 6991 AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, 6992 unsigned Flags) const { 6993 MachineInstr &MI = *MIT; 6994 MachineBasicBlock *MBB = MI.getParent(); 6995 MachineFunction *MF = MBB->getParent(); 6996 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); 6997 6998 // Don't outline anything used for return address signing. The outlined 6999 // function will get signed later if needed 7000 switch (MI.getOpcode()) { 7001 case AArch64::PACIASP: 7002 case AArch64::PACIBSP: 7003 case AArch64::AUTIASP: 7004 case AArch64::AUTIBSP: 7005 case AArch64::RETAA: 7006 case AArch64::RETAB: 7007 case AArch64::EMITBKEY: 7008 return outliner::InstrType::Illegal; 7009 } 7010 7011 // Don't outline LOHs. 7012 if (FuncInfo->getLOHRelated().count(&MI)) 7013 return outliner::InstrType::Illegal; 7014 7015 // We can only outline these if we will tail call the outlined function, or 7016 // fix up the CFI offsets. Currently, CFI instructions are outlined only if 7017 // in a tail call. 7018 // 7019 // FIXME: If the proper fixups for the offset are implemented, this should be 7020 // possible. 7021 if (MI.isCFIInstruction()) 7022 return outliner::InstrType::Legal; 7023 7024 // Don't allow debug values to impact outlining type. 7025 if (MI.isDebugInstr() || MI.isIndirectDebugValue()) 7026 return outliner::InstrType::Invisible; 7027 7028 // At this point, KILL instructions don't really tell us much so we can go 7029 // ahead and skip over them. 7030 if (MI.isKill()) 7031 return outliner::InstrType::Invisible; 7032 7033 // Is this a terminator for a basic block? 7034 if (MI.isTerminator()) { 7035 7036 // Is this the end of a function? 7037 if (MI.getParent()->succ_empty()) 7038 return outliner::InstrType::Legal; 7039 7040 // It's not, so don't outline it. 7041 return outliner::InstrType::Illegal; 7042 } 7043 7044 // Make sure none of the operands are un-outlinable. 7045 for (const MachineOperand &MOP : MI.operands()) { 7046 if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() || 7047 MOP.isTargetIndex()) 7048 return outliner::InstrType::Illegal; 7049 7050 // If it uses LR or W30 explicitly, then don't touch it. 7051 if (MOP.isReg() && !MOP.isImplicit() && 7052 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30)) 7053 return outliner::InstrType::Illegal; 7054 } 7055 7056 // Special cases for instructions that can always be outlined, but will fail 7057 // the later tests. e.g, ADRPs, which are PC-relative use LR, but can always 7058 // be outlined because they don't require a *specific* value to be in LR. 7059 if (MI.getOpcode() == AArch64::ADRP) 7060 return outliner::InstrType::Legal; 7061 7062 // If MI is a call we might be able to outline it. We don't want to outline 7063 // any calls that rely on the position of items on the stack. When we outline 7064 // something containing a call, we have to emit a save and restore of LR in 7065 // the outlined function. Currently, this always happens by saving LR to the 7066 // stack. Thus, if we outline, say, half the parameters for a function call 7067 // plus the call, then we'll break the callee's expectations for the layout 7068 // of the stack. 7069 // 7070 // FIXME: Allow calls to functions which construct a stack frame, as long 7071 // as they don't access arguments on the stack. 
7072 // FIXME: Figure out some way to analyze functions defined in other modules. 7073 // We should be able to compute the memory usage based on the IR calling 7074 // convention, even if we can't see the definition. 7075 if (MI.isCall()) { 7076 // Get the function associated with the call. Look at each operand and find 7077 // the one that represents the callee and get its name. 7078 const Function *Callee = nullptr; 7079 for (const MachineOperand &MOP : MI.operands()) { 7080 if (MOP.isGlobal()) { 7081 Callee = dyn_cast<Function>(MOP.getGlobal()); 7082 break; 7083 } 7084 } 7085 7086 // Never outline calls to mcount. There isn't any rule that would require 7087 // this, but the Linux kernel's "ftrace" feature depends on it. 7088 if (Callee && Callee->getName() == "\01_mcount") 7089 return outliner::InstrType::Illegal; 7090 7091 // If we don't know anything about the callee, assume it depends on the 7092 // stack layout of the caller. In that case, it's only legal to outline 7093 // as a tail-call. Explicitly list the call instructions we know about so we 7094 // don't get unexpected results with call pseudo-instructions. 7095 auto UnknownCallOutlineType = outliner::InstrType::Illegal; 7096 if (MI.getOpcode() == AArch64::BLR || 7097 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL) 7098 UnknownCallOutlineType = outliner::InstrType::LegalTerminator; 7099 7100 if (!Callee) 7101 return UnknownCallOutlineType; 7102 7103 // We have a function we have information about. Check it if it's something 7104 // can safely outline. 7105 MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee); 7106 7107 // We don't know what's going on with the callee at all. Don't touch it. 7108 if (!CalleeMF) 7109 return UnknownCallOutlineType; 7110 7111 // Check if we know anything about the callee saves on the function. If we 7112 // don't, then don't touch it, since that implies that we haven't 7113 // computed anything about its stack frame yet. 7114 MachineFrameInfo &MFI = CalleeMF->getFrameInfo(); 7115 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 || 7116 MFI.getNumObjects() > 0) 7117 return UnknownCallOutlineType; 7118 7119 // At this point, we can say that CalleeMF ought to not pass anything on the 7120 // stack. Therefore, we can outline it. 7121 return outliner::InstrType::Legal; 7122 } 7123 7124 // Don't outline positions. 7125 if (MI.isPosition()) 7126 return outliner::InstrType::Illegal; 7127 7128 // Don't touch the link register or W30. 7129 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) || 7130 MI.modifiesRegister(AArch64::W30, &getRegisterInfo())) 7131 return outliner::InstrType::Illegal; 7132 7133 // Don't outline BTI instructions, because that will prevent the outlining 7134 // site from being indirectly callable. 7135 if (MI.getOpcode() == AArch64::HINT) { 7136 int64_t Imm = MI.getOperand(0).getImm(); 7137 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38) 7138 return outliner::InstrType::Illegal; 7139 } 7140 7141 return outliner::InstrType::Legal; 7142 } 7143 7144 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const { 7145 for (MachineInstr &MI : MBB) { 7146 const MachineOperand *Base; 7147 unsigned Width; 7148 int64_t Offset; 7149 bool OffsetIsScalable; 7150 7151 // Is this a load or store with an immediate offset with SP as the base? 
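// (Each such SP-relative immediate is rewritten below as
// NewImm = (Offset + 16) / Scale, compensating for the 16-byte slot that now
// holds LR; the range was already verified when the candidate was deemed safe
// to outline.)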
7152 if (!MI.mayLoadOrStore() || 7153 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width, 7154 &RI) || 7155 (Base->isReg() && Base->getReg() != AArch64::SP)) 7156 continue; 7157 7158 // It is, so we have to fix it up. 7159 TypeSize Scale(0U, false); 7160 int64_t Dummy1, Dummy2; 7161 7162 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI); 7163 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!"); 7164 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2); 7165 assert(Scale != 0 && "Unexpected opcode!"); 7166 assert(!OffsetIsScalable && "Expected offset to be a byte offset"); 7167 7168 // We've pushed the return address to the stack, so add 16 to the offset. 7169 // This is safe, since we already checked if it would overflow when we 7170 // checked if this instruction was legal to outline. 7171 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedSize(); 7172 StackOffsetOperand.setImm(NewImm); 7173 } 7174 } 7175 7176 static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, 7177 bool ShouldSignReturnAddr, 7178 bool ShouldSignReturnAddrWithAKey) { 7179 if (ShouldSignReturnAddr) { 7180 MachineBasicBlock::iterator MBBPAC = MBB.begin(); 7181 MachineBasicBlock::iterator MBBAUT = MBB.getFirstTerminator(); 7182 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); 7183 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 7184 DebugLoc DL; 7185 7186 if (MBBAUT != MBB.end()) 7187 DL = MBBAUT->getDebugLoc(); 7188 7189 // At the very beginning of the basic block we insert the following 7190 // depending on the key type 7191 // 7192 // a_key: b_key: 7193 // PACIASP EMITBKEY 7194 // CFI_INSTRUCTION PACIBSP 7195 // CFI_INSTRUCTION 7196 unsigned PACI; 7197 if (ShouldSignReturnAddrWithAKey) { 7198 PACI = Subtarget.hasPAuth() ? AArch64::PACIA : AArch64::PACIASP; 7199 } else { 7200 BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::EMITBKEY)) 7201 .setMIFlag(MachineInstr::FrameSetup); 7202 PACI = Subtarget.hasPAuth() ? AArch64::PACIB : AArch64::PACIBSP; 7203 } 7204 7205 auto MI = BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(PACI)); 7206 if (Subtarget.hasPAuth()) 7207 MI.addReg(AArch64::LR, RegState::Define) 7208 .addReg(AArch64::LR) 7209 .addReg(AArch64::SP, RegState::InternalRead); 7210 MI.setMIFlag(MachineInstr::FrameSetup); 7211 7212 unsigned CFIIndex = 7213 MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); 7214 BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION)) 7215 .addCFIIndex(CFIIndex) 7216 .setMIFlags(MachineInstr::FrameSetup); 7217 7218 // If v8.3a features are available we can replace a RET instruction by 7219 // RETAA or RETAB and omit the AUT instructions 7220 if (Subtarget.hasPAuth() && MBBAUT != MBB.end() && 7221 MBBAUT->getOpcode() == AArch64::RET) { 7222 BuildMI(MBB, MBBAUT, DL, 7223 TII->get(ShouldSignReturnAddrWithAKey ? AArch64::RETAA 7224 : AArch64::RETAB)) 7225 .copyImplicitOps(*MBBAUT); 7226 MBB.erase(MBBAUT); 7227 } else { 7228 BuildMI(MBB, MBBAUT, DL, 7229 TII->get(ShouldSignReturnAddrWithAKey ? 
AArch64::AUTIASP 7230 : AArch64::AUTIBSP)) 7231 .setMIFlag(MachineInstr::FrameDestroy); 7232 } 7233 } 7234 } 7235 7236 void AArch64InstrInfo::buildOutlinedFrame( 7237 MachineBasicBlock &MBB, MachineFunction &MF, 7238 const outliner::OutlinedFunction &OF) const { 7239 7240 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>(); 7241 7242 if (OF.FrameConstructionID == MachineOutlinerTailCall) 7243 FI->setOutliningStyle("Tail Call"); 7244 else if (OF.FrameConstructionID == MachineOutlinerThunk) { 7245 // For thunk outlining, rewrite the last instruction from a call to a 7246 // tail-call. 7247 MachineInstr *Call = &*--MBB.instr_end(); 7248 unsigned TailOpcode; 7249 if (Call->getOpcode() == AArch64::BL) { 7250 TailOpcode = AArch64::TCRETURNdi; 7251 } else { 7252 assert(Call->getOpcode() == AArch64::BLR || 7253 Call->getOpcode() == AArch64::BLRNoIP); 7254 TailOpcode = AArch64::TCRETURNriALL; 7255 } 7256 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode)) 7257 .add(Call->getOperand(0)) 7258 .addImm(0); 7259 MBB.insert(MBB.end(), TC); 7260 Call->eraseFromParent(); 7261 7262 FI->setOutliningStyle("Thunk"); 7263 } 7264 7265 bool IsLeafFunction = true; 7266 7267 // Is there a call in the outlined range? 7268 auto IsNonTailCall = [](const MachineInstr &MI) { 7269 return MI.isCall() && !MI.isReturn(); 7270 }; 7271 7272 if (llvm::any_of(MBB.instrs(), IsNonTailCall)) { 7273 // Fix up the instructions in the range, since we're going to modify the 7274 // stack. 7275 7276 // Bugzilla ID: 46767 7277 // TODO: Check if fixing up twice is safe so we can outline these. 7278 assert(OF.FrameConstructionID != MachineOutlinerDefault && 7279 "Can only fix up stack references once"); 7280 fixupPostOutline(MBB); 7281 7282 IsLeafFunction = false; 7283 7284 // LR has to be a live in so that we can save it. 7285 if (!MBB.isLiveIn(AArch64::LR)) 7286 MBB.addLiveIn(AArch64::LR); 7287 7288 MachineBasicBlock::iterator It = MBB.begin(); 7289 MachineBasicBlock::iterator Et = MBB.end(); 7290 7291 if (OF.FrameConstructionID == MachineOutlinerTailCall || 7292 OF.FrameConstructionID == MachineOutlinerThunk) 7293 Et = std::prev(MBB.end()); 7294 7295 // Insert a save before the outlined region 7296 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) 7297 .addReg(AArch64::SP, RegState::Define) 7298 .addReg(AArch64::LR) 7299 .addReg(AArch64::SP) 7300 .addImm(-16); 7301 It = MBB.insert(It, STRXpre); 7302 7303 const TargetSubtargetInfo &STI = MF.getSubtarget(); 7304 const MCRegisterInfo *MRI = STI.getRegisterInfo(); 7305 unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true); 7306 7307 // Add a CFI saying the stack was moved 16 B down. 7308 int64_t StackPosEntry = 7309 MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16)); 7310 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) 7311 .addCFIIndex(StackPosEntry) 7312 .setMIFlags(MachineInstr::FrameSetup); 7313 7314 // Add a CFI saying that the LR that we want to find is now 16 B higher than 7315 // before. 7316 int64_t LRPosEntry = 7317 MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, -16)); 7318 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) 7319 .addCFIIndex(LRPosEntry) 7320 .setMIFlags(MachineInstr::FrameSetup); 7321 7322 // Insert a restore before the terminator for the function. 
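// The restore is the mirror image of the STRXpre above, so the outlined body
// is effectively bracketed by:
//   str lr, [sp, #-16]!   // STRXpre: save LR, SP -= 16
//   ...                   // outlined instructions
//   ldr lr, [sp], #16     // LDRXpost: reload LR, SP += 16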
7323 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) 7324 .addReg(AArch64::SP, RegState::Define) 7325 .addReg(AArch64::LR, RegState::Define) 7326 .addReg(AArch64::SP) 7327 .addImm(16); 7328 Et = MBB.insert(Et, LDRXpost); 7329 } 7330 7331 // If a bunch of candidates reach this point they must agree on their return 7332 // address signing. It is therefore enough to just consider the signing 7333 // behaviour of one of them 7334 const auto &MFI = *OF.Candidates.front().getMF()->getInfo<AArch64FunctionInfo>(); 7335 bool ShouldSignReturnAddr = MFI.shouldSignReturnAddress(!IsLeafFunction); 7336 7337 // a_key is the default 7338 bool ShouldSignReturnAddrWithAKey = !MFI.shouldSignWithBKey(); 7339 7340 // If this is a tail call outlined function, then there's already a return. 7341 if (OF.FrameConstructionID == MachineOutlinerTailCall || 7342 OF.FrameConstructionID == MachineOutlinerThunk) { 7343 signOutlinedFunction(MF, MBB, ShouldSignReturnAddr, 7344 ShouldSignReturnAddrWithAKey); 7345 return; 7346 } 7347 7348 // It's not a tail call, so we have to insert the return ourselves. 7349 7350 // LR has to be a live in so that we can return to it. 7351 if (!MBB.isLiveIn(AArch64::LR)) 7352 MBB.addLiveIn(AArch64::LR); 7353 7354 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET)) 7355 .addReg(AArch64::LR); 7356 MBB.insert(MBB.end(), ret); 7357 7358 signOutlinedFunction(MF, MBB, ShouldSignReturnAddr, 7359 ShouldSignReturnAddrWithAKey); 7360 7361 FI->setOutliningStyle("Function"); 7362 7363 // Did we have to modify the stack by saving the link register? 7364 if (OF.FrameConstructionID != MachineOutlinerDefault) 7365 return; 7366 7367 // We modified the stack. 7368 // Walk over the basic block and fix up all the stack accesses. 7369 fixupPostOutline(MBB); 7370 } 7371 7372 MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall( 7373 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, 7374 MachineFunction &MF, const outliner::Candidate &C) const { 7375 7376 // Are we tail calling? 7377 if (C.CallConstructionID == MachineOutlinerTailCall) { 7378 // If yes, then we can just branch to the label. 7379 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi)) 7380 .addGlobalAddress(M.getNamedValue(MF.getName())) 7381 .addImm(0)); 7382 return It; 7383 } 7384 7385 // Are we saving the link register? 7386 if (C.CallConstructionID == MachineOutlinerNoLRSave || 7387 C.CallConstructionID == MachineOutlinerThunk) { 7388 // No, so just insert the call. 7389 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL)) 7390 .addGlobalAddress(M.getNamedValue(MF.getName()))); 7391 return It; 7392 } 7393 7394 // We want to return the spot where we inserted the call. 7395 MachineBasicBlock::iterator CallPt; 7396 7397 // Instructions for saving and restoring LR around the call instruction we're 7398 // going to insert. 7399 MachineInstr *Save; 7400 MachineInstr *Restore; 7401 // Can we save to a register? 7402 if (C.CallConstructionID == MachineOutlinerRegSave) { 7403 // FIXME: This logic should be sunk into a target-specific interface so that 7404 // we don't have to recompute the register. 7405 unsigned Reg = findRegisterToSaveLRTo(C); 7406 assert(Reg != 0 && "No callee-saved register available?"); 7407 7408 // Save and restore LR from that register. 
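// ORRXrs with XZR as the first source operand is the canonical MOV alias, so
// these amount to 'mov Reg, lr' before the call and 'mov lr, Reg' after it.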
7409 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg) 7410 .addReg(AArch64::XZR) 7411 .addReg(AArch64::LR) 7412 .addImm(0); 7413 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR) 7414 .addReg(AArch64::XZR) 7415 .addReg(Reg) 7416 .addImm(0); 7417 } else { 7418 // We have the default case. Save and restore from SP. 7419 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) 7420 .addReg(AArch64::SP, RegState::Define) 7421 .addReg(AArch64::LR) 7422 .addReg(AArch64::SP) 7423 .addImm(-16); 7424 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) 7425 .addReg(AArch64::SP, RegState::Define) 7426 .addReg(AArch64::LR, RegState::Define) 7427 .addReg(AArch64::SP) 7428 .addImm(16); 7429 } 7430 7431 It = MBB.insert(It, Save); 7432 It++; 7433 7434 // Insert the call. 7435 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL)) 7436 .addGlobalAddress(M.getNamedValue(MF.getName()))); 7437 CallPt = It; 7438 It++; 7439 7440 It = MBB.insert(It, Restore); 7441 return CallPt; 7442 } 7443 7444 bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault( 7445 MachineFunction &MF) const { 7446 return MF.getFunction().hasMinSize(); 7447 } 7448 7449 Optional<DestSourcePair> 7450 AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { 7451 7452 // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg 7453 // and zero immediate operands used as an alias for mov instruction. 7454 if (MI.getOpcode() == AArch64::ORRWrs && 7455 MI.getOperand(1).getReg() == AArch64::WZR && 7456 MI.getOperand(3).getImm() == 0x0) { 7457 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)}; 7458 } 7459 7460 if (MI.getOpcode() == AArch64::ORRXrs && 7461 MI.getOperand(1).getReg() == AArch64::XZR && 7462 MI.getOperand(3).getImm() == 0x0) { 7463 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)}; 7464 } 7465 7466 return None; 7467 } 7468 7469 Optional<RegImmPair> AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, 7470 Register Reg) const { 7471 int Sign = 1; 7472 int64_t Offset = 0; 7473 7474 // TODO: Handle cases where Reg is a super- or sub-register of the 7475 // destination register. 7476 const MachineOperand &Op0 = MI.getOperand(0); 7477 if (!Op0.isReg() || Reg != Op0.getReg()) 7478 return None; 7479 7480 switch (MI.getOpcode()) { 7481 default: 7482 return None; 7483 case AArch64::SUBWri: 7484 case AArch64::SUBXri: 7485 case AArch64::SUBSWri: 7486 case AArch64::SUBSXri: 7487 Sign *= -1; 7488 LLVM_FALLTHROUGH; 7489 case AArch64::ADDSWri: 7490 case AArch64::ADDSXri: 7491 case AArch64::ADDWri: 7492 case AArch64::ADDXri: { 7493 // TODO: Third operand can be global address (usually some string). 7494 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() || 7495 !MI.getOperand(2).isImm()) 7496 return None; 7497 int Shift = MI.getOperand(3).getImm(); 7498 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12"); 7499 Offset = Sign * (MI.getOperand(2).getImm() << Shift); 7500 } 7501 } 7502 return RegImmPair{MI.getOperand(1).getReg(), Offset}; 7503 } 7504 7505 /// If the given ORR instruction is a copy, and \p DescribedReg overlaps with 7506 /// the destination register then, if possible, describe the value in terms of 7507 /// the source register. 
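/// For example, given the 32-bit copy 'mov w0, w1' (ORRWrs of WZR and w1), a
/// query for x0 can still be answered with w1, since the write to w0
/// implicitly zero-extends into x0.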
7508 static Optional<ParamLoadedValue> 7509 describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg, 7510 const TargetInstrInfo *TII, 7511 const TargetRegisterInfo *TRI) { 7512 auto DestSrc = TII->isCopyInstr(MI); 7513 if (!DestSrc) 7514 return None; 7515 7516 Register DestReg = DestSrc->Destination->getReg(); 7517 Register SrcReg = DestSrc->Source->getReg(); 7518 7519 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {}); 7520 7521 // If the described register is the destination, just return the source. 7522 if (DestReg == DescribedReg) 7523 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr); 7524 7525 // ORRWrs zero-extends to 64-bits, so we need to consider such cases. 7526 if (MI.getOpcode() == AArch64::ORRWrs && 7527 TRI->isSuperRegister(DestReg, DescribedReg)) 7528 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr); 7529 7530 // We may need to describe the lower part of a ORRXrs move. 7531 if (MI.getOpcode() == AArch64::ORRXrs && 7532 TRI->isSubRegister(DestReg, DescribedReg)) { 7533 Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32); 7534 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr); 7535 } 7536 7537 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) && 7538 "Unhandled ORR[XW]rs copy case"); 7539 7540 return None; 7541 } 7542 7543 Optional<ParamLoadedValue> 7544 AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI, 7545 Register Reg) const { 7546 const MachineFunction *MF = MI.getMF(); 7547 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); 7548 switch (MI.getOpcode()) { 7549 case AArch64::MOVZWi: 7550 case AArch64::MOVZXi: { 7551 // MOVZWi may be used for producing zero-extended 32-bit immediates in 7552 // 64-bit parameters, so we need to consider super-registers. 7553 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg)) 7554 return None; 7555 7556 if (!MI.getOperand(1).isImm()) 7557 return None; 7558 int64_t Immediate = MI.getOperand(1).getImm(); 7559 int Shift = MI.getOperand(2).getImm(); 7560 return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift), 7561 nullptr); 7562 } 7563 case AArch64::ORRWrs: 7564 case AArch64::ORRXrs: 7565 return describeORRLoadedValue(MI, Reg, this, TRI); 7566 } 7567 7568 return TargetInstrInfo::describeLoadedValue(MI, Reg); 7569 } 7570 7571 bool AArch64InstrInfo::isExtendLikelyToBeFolded( 7572 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const { 7573 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT || 7574 ExtMI.getOpcode() == TargetOpcode::G_ZEXT || 7575 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT); 7576 7577 // Anyexts are nops. 7578 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT) 7579 return true; 7580 7581 Register DefReg = ExtMI.getOperand(0).getReg(); 7582 if (!MRI.hasOneNonDBGUse(DefReg)) 7583 return false; 7584 7585 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an 7586 // addressing mode. 
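// For example, a G_LOAD of (G_PTR_ADD %base, (G_SEXT %idx)) can typically be
// selected as 'ldr x0, [xBase, wIdx, sxtw]', so the extend ends up costing
// nothing by itself.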
7587 auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg); 7588 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD; 7589 } 7590 7591 uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const { 7592 return get(Opc).TSFlags & AArch64::ElementSizeMask; 7593 } 7594 7595 bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const { 7596 return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike; 7597 } 7598 7599 bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const { 7600 return get(Opc).TSFlags & AArch64::InstrFlagIsWhile; 7601 } 7602 7603 unsigned int 7604 AArch64InstrInfo::getTailDuplicateSize(CodeGenOpt::Level OptLevel) const { 7605 return OptLevel >= CodeGenOpt::Aggressive ? 6 : 2; 7606 } 7607 7608 unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) { 7609 if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr()) 7610 return AArch64::BLRNoIP; 7611 else 7612 return AArch64::BLR; 7613 } 7614 7615 #define GET_INSTRINFO_HELPERS 7616 #define GET_INSTRMAP_INFO 7617 #include "AArch64GenInstrInfo.inc" 7618