//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>

using namespace llvm;

#define GET_INSTRINFO_CTOR_DTOR
#include "AArch64GenInstrInfo.inc"

static cl::opt<unsigned> TBZDisplacementBits(
    "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
    cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned> CBZDisplacementBits(
    "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
    cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned>
    BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
                        cl::desc("Restrict range of Bcc instructions (DEBUG)"));

AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
    : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
                          AArch64::CATCHRET),
      RI(STI.getTargetTriple()), Subtarget(STI) {}

/// GetInstSize - Return the number of bytes of code the specified
/// instruction may be. This returns the maximum number of bytes.
unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
  const MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction *MF = MBB.getParent();
  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();

  {
    auto Op = MI.getOpcode();
    if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
      return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI);
  }

  // Meta-instructions emit no code.
  if (MI.isMetaInstruction())
    return 0;

  // FIXME: We currently only handle pseudoinstructions that don't get expanded
  // before the assembly printer.
  unsigned NumBytes = 0;
  const MCInstrDesc &Desc = MI.getDesc();
  switch (Desc.getOpcode()) {
  default:
    // Anything not explicitly designated otherwise is a normal 4-byte insn.
    NumBytes = 4;
    break;
  case TargetOpcode::STACKMAP:
    // The upper bound for a stackmap intrinsic is the full length of its shadow
    NumBytes = StackMapOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::PATCHPOINT:
    // The size of the patchpoint intrinsic is the number of bytes requested
    NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    break;
  case TargetOpcode::STATEPOINT:
    NumBytes = StatepointOpers(&MI).getNumPatchBytes();
    assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
    // No patch bytes means a normal call inst is emitted
    if (NumBytes == 0)
      NumBytes = 4;
    break;
  case AArch64::TLSDESC_CALLSEQ:
    // This gets lowered to an instruction sequence which takes 16 bytes
    NumBytes = 16;
    break;
  case AArch64::SpeculationBarrierISBDSBEndBB:
    // This gets lowered to 2 4-byte instructions.
    NumBytes = 8;
    break;
  case AArch64::SpeculationBarrierSBEndBB:
    // This gets lowered to 1 4-byte instruction.
    NumBytes = 4;
    break;
  case AArch64::JumpTableDest32:
  case AArch64::JumpTableDest16:
  case AArch64::JumpTableDest8:
    NumBytes = 12;
    break;
  case AArch64::SPACE:
    NumBytes = MI.getOperand(1).getImm();
    break;
  case TargetOpcode::BUNDLE:
    NumBytes = getInstBundleLength(MI);
    break;
  }

  return NumBytes;
}

unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
  unsigned Size = 0;
  MachineBasicBlock::const_instr_iterator I = MI.getIterator();
  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
  while (++I != E && I->isInsideBundle()) {
    assert(!I->isBundle() && "No nested bundle!");
    Size += getInstSizeInBytes(*I);
  }
  return Size;
}

static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
                            SmallVectorImpl<MachineOperand> &Cond) {
  // Block ends with fall-through condbranch.
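  // A note on the Cond encoding produced below (derived from the cases in this
  // switch): a plain Bcc pushes just its condition-code operand, while the
  // folded compare-and-branch forms push a leading -1 sentinel followed by the
  // opcode, the tested register and, for TB(N)Z, the bit number.
  // reverseBranchCondition() and instantiateCondBranch() rely on this layout.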
  switch (LastInst->getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    Target = LastInst->getOperand(1).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    Target = LastInst->getOperand(2).getMBB();
    Cond.push_back(MachineOperand::CreateImm(-1));
    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
    Cond.push_back(LastInst->getOperand(0));
    Cond.push_back(LastInst->getOperand(1));
  }
}

static unsigned getBranchDisplacementBits(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return 64;
  case AArch64::TBNZW:
  case AArch64::TBZW:
  case AArch64::TBNZX:
  case AArch64::TBZX:
    return TBZDisplacementBits;
  case AArch64::CBNZW:
  case AArch64::CBZW:
  case AArch64::CBNZX:
  case AArch64::CBZX:
    return CBZDisplacementBits;
  case AArch64::Bcc:
    return BCCDisplacementBits;
  }
}

bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
                                             int64_t BrOffset) const {
  unsigned Bits = getBranchDisplacementBits(BranchOp);
  assert(Bits >= 3 && "max branch displacement must be enough to jump "
                      "over conditional branch expansion");
  return isIntN(Bits, BrOffset / 4);
}

MachineBasicBlock *
AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected opcode!");
  case AArch64::B:
    return MI.getOperand(0).getMBB();
  case AArch64::TBZW:
  case AArch64::TBNZW:
  case AArch64::TBZX:
  case AArch64::TBNZX:
    return MI.getOperand(2).getMBB();
  case AArch64::CBZW:
  case AArch64::CBNZW:
  case AArch64::CBZX:
  case AArch64::CBNZX:
  case AArch64::Bcc:
    return MI.getOperand(1).getMBB();
  }
}

// Branch analysis.
bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *&TBB,
                                     MachineBasicBlock *&FBB,
                                     SmallVectorImpl<MachineOperand> &Cond,
                                     bool AllowModify) const {
  // If the block has no terminators, it just falls into the block after it.
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return false;

  // Skip over SpeculationBarrierEndBB terminators
  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
    --I;
  }

  if (!isUnpredicatedTerminator(*I))
    return false;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;

  // If there is only one terminator instruction, process it.
  unsigned LastOpc = LastInst->getOpcode();
  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
    if (isUncondBranchOpcode(LastOpc)) {
      TBB = LastInst->getOperand(0).getMBB();
      return false;
    }
    if (isCondBranchOpcode(LastOpc)) {
      // Block ends with fall-through condbranch.
      parseCondBranch(LastInst, TBB, Cond);
      return false;
    }
    return true; // Can't handle indirect branch.
  }

  // Get the instruction before it if it is a terminator.
  MachineInstr *SecondLastInst = &*I;
  unsigned SecondLastOpc = SecondLastInst->getOpcode();

  // If AllowModify is true and the block ends with two or more unconditional
  // branches, delete all but the first unconditional branch.
  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
    while (isUncondBranchOpcode(SecondLastOpc)) {
      LastInst->eraseFromParent();
      LastInst = SecondLastInst;
      LastOpc = LastInst->getOpcode();
      if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
        // Return now; the only remaining terminator is an unconditional branch.
        TBB = LastInst->getOperand(0).getMBB();
        return false;
      } else {
        SecondLastInst = &*I;
        SecondLastOpc = SecondLastInst->getOpcode();
      }
    }
  }

  // If we're allowed to modify and the block ends in an unconditional branch
  // which could simply fall through, remove the branch. (Note: This case only
  // matters when we can't understand the whole sequence, otherwise it's also
  // handled by BranchFolding.cpp.)
  if (AllowModify && isUncondBranchOpcode(LastOpc) &&
      MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) {
    LastInst->eraseFromParent();
    LastInst = SecondLastInst;
    LastOpc = LastInst->getOpcode();
    if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
      assert(!isUncondBranchOpcode(LastOpc) &&
             "unreachable unconditional branches removed above");

      if (isCondBranchOpcode(LastOpc)) {
        // Block ends with fall-through condbranch.
        parseCondBranch(LastInst, TBB, Cond);
        return false;
      }
      return true; // Can't handle indirect branch.
    } else {
      SecondLastInst = &*I;
      SecondLastOpc = SecondLastInst->getOpcode();
    }
  }

  // If there are three terminators, we don't know what sort of block this is.
  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
    return true;

  // If the block ends with a B and a Bcc, handle it.
  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    parseCondBranch(SecondLastInst, TBB, Cond);
    FBB = LastInst->getOperand(0).getMBB();
    return false;
  }

  // If the block ends with two unconditional branches, handle it. The second
  // one is not executed, so remove it.
  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    TBB = SecondLastInst->getOperand(0).getMBB();
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return false;
  }

  // ...likewise if it ends with an indirect branch followed by an unconditional
  // branch.
  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return true;
  }

  // Otherwise, can't handle this.
  return true;
}

bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
                                              MachineBranchPredicate &MBP,
                                              bool AllowModify) const {
  // For the moment, handle only a block which ends with a cb(n)zx followed by
  // a fallthrough. Why this? Because it is a common form.
  // TODO: Should we handle b.cc?
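  // For example, a block ending in `cbnz x0, %bb.2` that falls through to the
  // next block is reported as LHS = x0, RHS = #0, Predicate = PRED_NE,
  // TrueDest = %bb.2 and FalseDest = the fallthrough block (see the
  // assignments below).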

  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return true;

  // Skip over SpeculationBarrierEndBB terminators
  if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
      I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
    --I;
  }

  if (!isUnpredicatedTerminator(*I))
    return true;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;
  unsigned LastOpc = LastInst->getOpcode();
  if (!isCondBranchOpcode(LastOpc))
    return true;

  switch (LastOpc) {
  default:
    return true;
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    break;
  };

  MBP.TrueDest = LastInst->getOperand(1).getMBB();
  assert(MBP.TrueDest && "expected!");
  MBP.FalseDest = MBB.getNextNode();

  MBP.ConditionDef = nullptr;
  MBP.SingleUseCondition = false;

  MBP.LHS = LastInst->getOperand(0);
  MBP.RHS = MachineOperand::CreateImm(0);
  MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
                                            : MachineBranchPredicate::PRED_EQ;
  return false;
}

bool AArch64InstrInfo::reverseBranchCondition(
    SmallVectorImpl<MachineOperand> &Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
    Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
  } else {
    // Folded compare-and-branch
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown conditional branch!");
    case AArch64::CBZW:
      Cond[1].setImm(AArch64::CBNZW);
      break;
    case AArch64::CBNZW:
      Cond[1].setImm(AArch64::CBZW);
      break;
    case AArch64::CBZX:
      Cond[1].setImm(AArch64::CBNZX);
      break;
    case AArch64::CBNZX:
      Cond[1].setImm(AArch64::CBZX);
      break;
    case AArch64::TBZW:
      Cond[1].setImm(AArch64::TBNZW);
      break;
    case AArch64::TBNZW:
      Cond[1].setImm(AArch64::TBZW);
      break;
    case AArch64::TBZX:
      Cond[1].setImm(AArch64::TBNZX);
      break;
    case AArch64::TBNZX:
      Cond[1].setImm(AArch64::TBZX);
      break;
    }
  }

  return false;
}

unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
                                        int *BytesRemoved) const {
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return 0;

  if (!isUncondBranchOpcode(I->getOpcode()) &&
      !isCondBranchOpcode(I->getOpcode()))
    return 0;

  // Remove the branch.
  I->eraseFromParent();

  I = MBB.end();

  if (I == MBB.begin()) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }
  --I;
  if (!isCondBranchOpcode(I->getOpcode())) {
    if (BytesRemoved)
      *BytesRemoved = 4;
    return 1;
  }

  // Remove the branch.
  I->eraseFromParent();
  if (BytesRemoved)
    *BytesRemoved = 8;

  return 2;
}

void AArch64InstrInfo::instantiateCondBranch(
    MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
    ArrayRef<MachineOperand> Cond) const {
  if (Cond[0].getImm() != -1) {
    // Regular Bcc
    BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
  } else {
    // Folded compare-and-branch
    // Note that we use addOperand instead of addReg to keep the flags.
    const MachineInstrBuilder MIB =
        BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
    if (Cond.size() > 3)
      MIB.addImm(Cond[3].getImm());
    MIB.addMBB(TBB);
  }
}

unsigned AArch64InstrInfo::insertBranch(
    MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
    ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
  // Shouldn't be a fall through.
  assert(TBB && "insertBranch must not be told to insert a fallthrough");

  if (!FBB) {
    if (Cond.empty()) // Unconditional branch?
      BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
    else
      instantiateCondBranch(MBB, DL, TBB, Cond);

    if (BytesAdded)
      *BytesAdded = 4;

    return 1;
  }

  // Two-way conditional branch.
  instantiateCondBranch(MBB, DL, TBB, Cond);
  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);

  if (BytesAdded)
    *BytesAdded = 8;

  return 2;
}

// Find the original register that VReg is copied from.
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
  while (Register::isVirtualRegister(VReg)) {
    const MachineInstr *DefMI = MRI.getVRegDef(VReg);
    if (!DefMI->isFullCopy())
      return VReg;
    VReg = DefMI->getOperand(1).getReg();
  }
  return VReg;
}

// Determine if VReg is defined by an instruction that can be folded into a
// csel instruction. If so, return the folded opcode, and the replacement
// register.
static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
                                unsigned *NewVReg = nullptr) {
  VReg = removeCopies(MRI, VReg);
  if (!Register::isVirtualRegister(VReg))
    return 0;

  bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
  unsigned Opc = 0;
  unsigned SrcOpNum = 0;
  switch (DefMI->getOpcode()) {
  case AArch64::ADDSXri:
  case AArch64::ADDSWri:
    // if NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
      return 0;
    // fall-through to ADDXri and ADDWri.
    LLVM_FALLTHROUGH;
  case AArch64::ADDXri:
  case AArch64::ADDWri:
    // add x, 1 -> csinc.
    if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
        DefMI->getOperand(3).getImm() != 0)
      return 0;
    SrcOpNum = 1;
    Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
    break;

  case AArch64::ORNXrr:
  case AArch64::ORNWrr: {
    // not x -> csinv, represented as orn dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcOpNum = 2;
    Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
    break;
  }

  case AArch64::SUBSXrr:
  case AArch64::SUBSWrr:
    // if NZCV is used, do not fold.
    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
      return 0;
    // fall-through to SUBXrr and SUBWrr.
    LLVM_FALLTHROUGH;
  case AArch64::SUBXrr:
  case AArch64::SUBWrr: {
    // neg x -> csneg, represented as sub dst, xzr, src.
    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
      return 0;
    SrcOpNum = 2;
    Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
    break;
  }
  default:
    return 0;
  }
  assert(Opc && SrcOpNum && "Missing parameters");

  if (NewVReg)
    *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
  return Opc;
}

bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
                                       ArrayRef<MachineOperand> Cond,
                                       Register DstReg, Register TrueReg,
                                       Register FalseReg, int &CondCycles,
                                       int &TrueCycles,
                                       int &FalseCycles) const {
  // Check register classes.
  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *RC =
      RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
  if (!RC)
    return false;

  // Also need to check the dest regclass, in case we're trying to optimize
  // something like:
  // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
  if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
    return false;

  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
  unsigned ExtraCondLat = Cond.size() != 1;

  // GPRs are handled by csel.
  // FIXME: Fold in x+1, -x, and ~x when applicable.
  if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
      AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
    // Single-cycle csel, csinc, csinv, and csneg.
    CondCycles = 1 + ExtraCondLat;
    TrueCycles = FalseCycles = 1;
    if (canFoldIntoCSel(MRI, TrueReg))
      TrueCycles = 0;
    else if (canFoldIntoCSel(MRI, FalseReg))
      FalseCycles = 0;
    return true;
  }

  // Scalar floating point is handled by fcsel.
  // FIXME: Form fabs, fmin, and fmax when applicable.
  if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
      AArch64::FPR32RegClass.hasSubClassEq(RC)) {
    CondCycles = 5 + ExtraCondLat;
    TrueCycles = FalseCycles = 2;
    return true;
  }

  // Can't do vectors.
  return false;
}

void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const DebugLoc &DL, Register DstReg,
                                    ArrayRef<MachineOperand> Cond,
                                    Register TrueReg, Register FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Parse the condition code, see parseCondBranch() above.
  AArch64CC::CondCode CC;
  switch (Cond.size()) {
  default:
    llvm_unreachable("Unknown condition opcode in Cond");
  case 1: // b.cc
    CC = AArch64CC::CondCode(Cond[0].getImm());
    break;
  case 3: { // cbz/cbnz
    // We must insert a compare against 0.
    bool Is64Bit;
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::CBZW:
      Is64Bit = false;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBZX:
      Is64Bit = true;
      CC = AArch64CC::EQ;
      break;
    case AArch64::CBNZW:
      Is64Bit = false;
      CC = AArch64CC::NE;
      break;
    case AArch64::CBNZX:
      Is64Bit = true;
      CC = AArch64CC::NE;
      break;
    }
    Register SrcReg = Cond[2].getReg();
    if (Is64Bit) {
      // cmp reg, #0 is actually subs xzr, reg, #0.
      MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    } else {
      MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
      BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
          .addReg(SrcReg)
          .addImm(0)
          .addImm(0);
    }
    break;
  }
  case 4: { // tbz/tbnz
    // We must insert a tst instruction.
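    // For example, a TBZW of bit 3 is materialized below as
    // `ands wzr, <reg>, #(1 << 3)` with CC = EQ, so the subsequent csel tests
    // the same bit the branch would have.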
    switch (Cond[1].getImm()) {
    default:
      llvm_unreachable("Unknown branch opcode in Cond");
    case AArch64::TBZW:
    case AArch64::TBZX:
      CC = AArch64CC::EQ;
      break;
    case AArch64::TBNZW:
    case AArch64::TBNZX:
      CC = AArch64CC::NE;
      break;
    }
    // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
      BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
    else
      BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
    break;
  }
  }

  unsigned Opc = 0;
  const TargetRegisterClass *RC = nullptr;
  bool TryFold = false;
  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
    RC = &AArch64::GPR64RegClass;
    Opc = AArch64::CSELXr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
    RC = &AArch64::GPR32RegClass;
    Opc = AArch64::CSELWr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FCSELDrrr;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
    RC = &AArch64::FPR32RegClass;
    Opc = AArch64::FCSELSrrr;
  }
  assert(RC && "Unsupported regclass");

  // Try folding simple instructions into the csel.
  if (TryFold) {
    unsigned NewVReg = 0;
    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
    if (FoldedOpc) {
      // The folded opcodes csinc, csinv and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
      CC = AArch64CC::getInvertedCondCode(CC);
      TrueReg = FalseReg;
    } else
      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);

    // Fold the operation. Leave any dead instructions for DCE to clean up.
    if (FoldedOpc) {
      FalseReg = NewVReg;
      Opc = FoldedOpc;
      // This extends the live range of NewVReg.
      MRI.clearKillFlags(NewVReg);
    }
  }

  // Pull all virtual registers into the appropriate class.
  MRI.constrainRegClass(TrueReg, RC);
  MRI.constrainRegClass(FalseReg, RC);

  // Insert the csel.
  BuildMI(MBB, I, DL, get(Opc), DstReg)
      .addReg(TrueReg)
      .addReg(FalseReg)
      .addImm(CC);
}

/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
  uint64_t Imm = MI.getOperand(1).getImm();
  uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
  uint64_t Encoding;
  return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
}

// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
  if (!Subtarget.hasCustomCheapAsMoveHandling())
    return MI.isAsCheapAsAMove();

  const unsigned Opcode = MI.getOpcode();

  // Firstly, check cases gated by features.
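  // On subtargets with zero-cycle zeroing, zeroing an FP register (FMOV[HSD]0)
  // or copying from WZR/XZR is presumed to be handled in register rename, so
  // it is treated as no more expensive than a move.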

  if (Subtarget.hasZeroCycleZeroingFP()) {
    if (Opcode == AArch64::FMOVH0 ||
        Opcode == AArch64::FMOVS0 ||
        Opcode == AArch64::FMOVD0)
      return true;
  }

  if (Subtarget.hasZeroCycleZeroingGP()) {
    if (Opcode == TargetOpcode::COPY &&
        (MI.getOperand(1).getReg() == AArch64::WZR ||
         MI.getOperand(1).getReg() == AArch64::XZR))
      return true;
  }

  // Secondly, check cases specific to sub-targets.

  if (Subtarget.hasExynosCheapAsMoveHandling()) {
    if (isExynosCheapAsMove(MI))
      return true;

    return MI.isAsCheapAsAMove();
  }

  // Finally, check generic cases.

  switch (Opcode) {
  default:
    return false;

  // add/sub on register without shift
  case AArch64::ADDWri:
  case AArch64::ADDXri:
  case AArch64::SUBWri:
  case AArch64::SUBXri:
    return (MI.getOperand(3).getImm() == 0);

  // logical ops on immediate
  case AArch64::ANDWri:
  case AArch64::ANDXri:
  case AArch64::EORWri:
  case AArch64::EORXri:
  case AArch64::ORRWri:
  case AArch64::ORRXri:
    return true;

  // logical ops on register without shift
  case AArch64::ANDWrr:
  case AArch64::ANDXrr:
  case AArch64::BICWrr:
  case AArch64::BICXrr:
  case AArch64::EONWrr:
  case AArch64::EONXrr:
  case AArch64::EORWrr:
  case AArch64::EORXrr:
  case AArch64::ORNWrr:
  case AArch64::ORNXrr:
  case AArch64::ORRWrr:
  case AArch64::ORRXrr:
    return true;

  // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
  // ORRXri, it is as cheap as MOV
  case AArch64::MOVi32imm:
    return canBeExpandedToORR(MI, 32);
  case AArch64::MOVi64imm:
    return canBeExpandedToORR(MI, 64);
  }

  llvm_unreachable("Unknown opcode to check as cheap as a move!");
}

bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::ADDWrs:
  case AArch64::ADDXrs:
  case AArch64::ADDSWrs:
  case AArch64::ADDSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    if (ShiftVal == 0)
      return true;
    return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
  }

  case AArch64::ADDWrx:
  case AArch64::ADDXrx:
  case AArch64::ADDXrx64:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrx:
  case AArch64::ADDSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) <= 4;
    }
  }

  case AArch64::SUBWrs:
  case AArch64::SUBSWrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
  }

  case AArch64::SUBXrs:
  case AArch64::SUBSXrs: {
    unsigned Imm = MI.getOperand(3).getImm();
    unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
    return ShiftVal == 0 ||
           (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
  }

  case AArch64::SUBWrx:
  case AArch64::SUBXrx:
  case AArch64::SUBXrx64:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrx:
  case AArch64::SUBSXrx64: {
    unsigned Imm = MI.getOperand(3).getImm();
    switch (AArch64_AM::getArithExtendType(Imm)) {
    default:
      return false;
    case AArch64_AM::UXTB:
    case AArch64_AM::UXTH:
    case AArch64_AM::UXTW:
    case AArch64_AM::UXTX:
      return AArch64_AM::getArithShiftValue(Imm) == 0;
    }
  }

  case AArch64::LDRBBroW:
  case AArch64::LDRBBroX:
  case AArch64::LDRBroW:
  case AArch64::LDRBroX:
  case AArch64::LDRDroW:
  case AArch64::LDRDroX:
  case AArch64::LDRHHroW:
  case AArch64::LDRHHroX:
  case AArch64::LDRHroW:
  case AArch64::LDRHroX:
  case AArch64::LDRQroW:
  case AArch64::LDRQroX:
  case AArch64::LDRSBWroW:
  case AArch64::LDRSBWroX:
  case AArch64::LDRSBXroW:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSHWroW:
  case AArch64::LDRSHWroX:
  case AArch64::LDRSHXroW:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSWroW:
  case AArch64::LDRSWroX:
  case AArch64::LDRSroW:
  case AArch64::LDRSroX:
  case AArch64::LDRWroW:
  case AArch64::LDRWroX:
  case AArch64::LDRXroW:
  case AArch64::LDRXroX:
  case AArch64::PRFMroW:
  case AArch64::PRFMroX:
  case AArch64::STRBBroW:
  case AArch64::STRBBroX:
  case AArch64::STRBroW:
  case AArch64::STRBroX:
  case AArch64::STRDroW:
  case AArch64::STRDroX:
  case AArch64::STRHHroW:
  case AArch64::STRHHroX:
  case AArch64::STRHroW:
  case AArch64::STRHroX:
  case AArch64::STRQroW:
  case AArch64::STRQroX:
  case AArch64::STRSroW:
  case AArch64::STRSroX:
  case AArch64::STRWroW:
  case AArch64::STRWroX:
  case AArch64::STRXroW:
  case AArch64::STRXroX: {
    unsigned IsSigned = MI.getOperand(3).getImm();
    return !IsSigned;
  }
  }
}

bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  case AArch64::SEH_StackAlloc:
  case AArch64::SEH_SaveFPLR:
  case AArch64::SEH_SaveFPLR_X:
  case AArch64::SEH_SaveReg:
  case AArch64::SEH_SaveReg_X:
  case AArch64::SEH_SaveRegP:
  case AArch64::SEH_SaveRegP_X:
  case AArch64::SEH_SaveFReg:
  case AArch64::SEH_SaveFReg_X:
  case AArch64::SEH_SaveFRegP:
  case AArch64::SEH_SaveFRegP_X:
  case AArch64::SEH_SetFP:
  case AArch64::SEH_AddFP:
  case AArch64::SEH_Nop:
  case AArch64::SEH_PrologEnd:
  case AArch64::SEH_EpilogStart:
  case AArch64::SEH_EpilogEnd:
    return true;
  }
}

bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
                                             Register &SrcReg, Register &DstReg,
                                             unsigned &SubIdx) const {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::SBFMXri: // aka sxtw
  case AArch64::UBFMXri: // aka uxtw
    // Check for the 32 -> 64 bit extension case, these instructions can do
    // much more.
    if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
      return false;
    // This is a signed or unsigned 32 -> 64 bit extension.
    SrcReg = MI.getOperand(1).getReg();
    DstReg = MI.getOperand(0).getReg();
    SubIdx = AArch64::sub_32;
    return true;
  }
}

bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
    const MachineInstr &MIa, const MachineInstr &MIb) const {
  const TargetRegisterInfo *TRI = &getRegisterInfo();
  const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
  int64_t OffsetA = 0, OffsetB = 0;
  unsigned WidthA = 0, WidthB = 0;
  bool OffsetAIsScalable = false, OffsetBIsScalable = false;

  assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
  assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");

  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
      MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
    return false;

  // Retrieve the base, offset from the base and width. Width
  // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
  // the bases are identical, and the offset of a lower memory access +
  // the width doesn't overlap the offset of a higher memory access,
  // then the memory accesses are different.
  // If OffsetAIsScalable and OffsetBIsScalable are both true, they
  // are assumed to have the same scale (vscale).
  if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
                                   WidthA, TRI) &&
      getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
                                   WidthB, TRI)) {
    if (BaseOpA->isIdenticalTo(*BaseOpB) &&
        OffsetAIsScalable == OffsetBIsScalable) {
      int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
      int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
      int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
      if (LowOffset + LowWidth <= HighOffset)
        return true;
    }
  }
  return false;
}

bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
                                            const MachineBasicBlock *MBB,
                                            const MachineFunction &MF) const {
  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
    return true;
  switch (MI.getOpcode()) {
  case AArch64::HINT:
    // CSDB hints are scheduling barriers.
    if (MI.getOperand(0).getImm() == 0x14)
      return true;
    break;
  case AArch64::DSB:
  case AArch64::ISB:
    // DSB and ISB also are scheduling barriers.
    return true;
  default:;
  }
  return isSEHInstruction(MI);
}

/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
                                      Register &SrcReg2, int &CmpMask,
                                      int &CmpValue) const {
  // The first operand can be a frame index where we'd normally expect a
  // register.
  assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
  if (!MI.getOperand(1).isReg())
    return false;

  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::PTEST_PP:
    SrcReg = MI.getOperand(0).getReg();
    SrcReg2 = MI.getOperand(1).getReg();
    // Not sure about the mask and value for now...
    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWrr:
  case AArch64::SUBSWrs:
  case AArch64::SUBSWrx:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXrs:
  case AArch64::SUBSXrx:
  case AArch64::ADDSWrr:
  case AArch64::ADDSWrs:
  case AArch64::ADDSWrx:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXrs:
  case AArch64::ADDSXrx:
    // Replace SUBSWrr with SUBWrr if NZCV is not used.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = MI.getOperand(2).getReg();
    CmpMask = ~0;
    CmpValue = 0;
    return true;
  case AArch64::SUBSWri:
  case AArch64::ADDSWri:
  case AArch64::SUBSXri:
  case AArch64::ADDSXri:
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    // FIXME: In order to convert CmpValue to 0 or 1
    CmpValue = MI.getOperand(2).getImm() != 0;
    return true;
  case AArch64::ANDSWri:
  case AArch64::ANDSXri:
    // ANDS does not use the same encoding scheme as the other xxxS
    // instructions.
    SrcReg = MI.getOperand(1).getReg();
    SrcReg2 = 0;
    CmpMask = ~0;
    // FIXME: The return val type of decodeLogicalImmediate is uint64_t,
    // while the type of CmpValue is int. When converting uint64_t to int,
    // the high 32 bits of uint64_t will be lost.
    // In fact it causes a bug in spec2006-483.xalancbmk.
    // CmpValue is only used to compare with zero in OptimizeCompareInstr.
    CmpValue = AArch64_AM::decodeLogicalImmediate(
                   MI.getOperand(2).getImm(),
                   MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0;
    return true;
  }

  return false;
}

static bool UpdateOperandRegClass(MachineInstr &Instr) {
  MachineBasicBlock *MBB = Instr.getParent();
  assert(MBB && "Can't get MachineBasicBlock here");
  MachineFunction *MF = MBB->getParent();
  assert(MF && "Can't get MachineFunction here");
  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  MachineRegisterInfo *MRI = &MF->getRegInfo();

  for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
       ++OpIdx) {
    MachineOperand &MO = Instr.getOperand(OpIdx);
    const TargetRegisterClass *OpRegCstraints =
        Instr.getRegClassConstraint(OpIdx, TII, TRI);

    // If there's no constraint, there's nothing to do.
    if (!OpRegCstraints)
      continue;
    // If the operand is a frame index, there's nothing to do here.
    // A frame index operand will resolve correctly during PEI.
    if (MO.isFI())
      continue;

    assert(MO.isReg() &&
           "Operand has register constraints without being a register!");

    Register Reg = MO.getReg();
    if (Register::isPhysicalRegister(Reg)) {
      if (!OpRegCstraints->contains(Reg))
        return false;
    } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
               !MRI->constrainRegClass(Reg, OpRegCstraints))
      return false;
  }

  return true;
}

/// Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible for doing the actual
/// substitution and legality checking.
static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
  // Don't convert all compare instructions, because for some the zero register
  // encoding becomes the sp register.
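  // (Presumably: in some of the unflagged encodings the destination field
  // interprets register 31 as SP rather than the zero register, so an
  // instruction that defines WZR/XZR must keep its flag-setting opcode; hence
  // the MIDefinesZeroReg check below.)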
  bool MIDefinesZeroReg = false;
  if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR))
    MIDefinesZeroReg = true;

  switch (MI.getOpcode()) {
  default:
    return MI.getOpcode();
  case AArch64::ADDSWrr:
    return AArch64::ADDWrr;
  case AArch64::ADDSWri:
    return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
  case AArch64::ADDSWrs:
    return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
  case AArch64::ADDSWrx:
    return AArch64::ADDWrx;
  case AArch64::ADDSXrr:
    return AArch64::ADDXrr;
  case AArch64::ADDSXri:
    return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
  case AArch64::ADDSXrs:
    return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
  case AArch64::ADDSXrx:
    return AArch64::ADDXrx;
  case AArch64::SUBSWrr:
    return AArch64::SUBWrr;
  case AArch64::SUBSWri:
    return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
  case AArch64::SUBSWrs:
    return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
  case AArch64::SUBSWrx:
    return AArch64::SUBWrx;
  case AArch64::SUBSXrr:
    return AArch64::SUBXrr;
  case AArch64::SUBSXri:
    return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
  case AArch64::SUBSXrs:
    return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
  case AArch64::SUBSXrx:
    return AArch64::SUBXrx;
  }
}

enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };

/// True when condition flags are accessed (either by writing or reading)
/// on the instruction trace starting at From and ending at To.
///
/// Note: If From and To are from different blocks it's assumed the condition
/// flags are accessed on the path.
static bool areCFlagsAccessedBetweenInstrs(
    MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
    const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
  // Early exit if To is at the beginning of the BB.
  if (To == To->getParent()->begin())
    return true;

  // Check whether the instructions are in the same basic block
  // If not, assume the condition flags might get modified somewhere.
  if (To->getParent() != From->getParent())
    return true;

  // From must be above To.
  assert(std::any_of(
      ++To.getReverse(), To->getParent()->rend(),
      [From](MachineInstr &MI) { return MI.getIterator() == From; }));

  // We iterate backward starting at \p To until we hit \p From.
  for (const MachineInstr &Instr :
       instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
    if (((AccessToCheck & AK_Write) &&
         Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
        ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
      return true;
  }
  return false;
}

/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
/// operation which could set the flags in an identical manner
bool AArch64InstrInfo::optimizePTestInstr(
    MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
    const MachineRegisterInfo *MRI) const {
  auto *Mask = MRI->getUniqueVRegDef(MaskReg);
  auto *Pred = MRI->getUniqueVRegDef(PredReg);
  auto NewOp = Pred->getOpcode();
  bool OpChanged = false;

  unsigned MaskOpcode = Mask->getOpcode();
  unsigned PredOpcode = Pred->getOpcode();
  bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode);
  bool PredIsWhileLike = isWhileOpcode(PredOpcode);

  if (isPTrueOpcode(MaskOpcode) && (PredIsPTestLike || PredIsWhileLike)) {
    // For PTEST(PTRUE, OTHER_INST), PTEST is redundant when PTRUE doesn't
    // deactivate any lanes OTHER_INST might set.
    uint64_t MaskElementSize = getElementSizeForOpcode(MaskOpcode);
    uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);

    // Must be an all active predicate of matching element size.
    if ((PredElementSize != MaskElementSize) ||
        (Mask->getOperand(1).getImm() != 31))
      return false;

    // Fallthrough to simply remove the PTEST.
  } else if ((Mask == Pred) && (PredIsPTestLike || PredIsWhileLike)) {
    // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
    // instruction that sets the flags as PTEST would.

    // Fallthrough to simply remove the PTEST.
  } else if (PredIsPTestLike) {
    // For PTEST(PG_1, PTEST_LIKE(PG2, ...)), PTEST is redundant when both
    // instructions use the same predicate.
    auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
    if (Mask != PTestLikeMask)
      return false;

    // Fallthrough to simply remove the PTEST.
  } else {
    switch (Pred->getOpcode()) {
    case AArch64::BRKB_PPzP:
    case AArch64::BRKPB_PPzPP: {
      // Op 0 is chain, 1 is the mask, 2 the previous predicate to
      // propagate, 3 the new predicate.

      // Check to see if our mask is the same as the brkpb's. If
      // not the resulting flag bits may be different and we
      // can't remove the ptest.
      auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
      if (Mask != PredMask)
        return false;

      // Switch to the new opcode.
      NewOp = Pred->getOpcode() == AArch64::BRKB_PPzP ? AArch64::BRKBS_PPzP
                                                      : AArch64::BRKPBS_PPzPP;
      OpChanged = true;
      break;
    }
    case AArch64::BRKN_PPzP: {
      auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
      if (Mask != PredMask)
        return false;

      NewOp = AArch64::BRKNS_PPzP;
      OpChanged = true;
      break;
    }
    default:
      // Bail out if we don't recognize the input.
      return false;
    }
  }

  const TargetRegisterInfo *TRI = &getRegisterInfo();

  // If the predicate is in a different block (possibly because it's been
  // hoisted out), then assume the flags are set in between statements.
  if (Pred->getParent() != PTest->getParent())
    return false;

  // If another instruction between the propagation and test sets the
  // flags, don't remove the ptest.
  MachineBasicBlock::iterator I = Pred, E = PTest;
  ++I; // Skip past the predicate op itself.
  for (; I != E; ++I) {
    const MachineInstr &Inst = *I;

    // TODO: If the ptest flags are unused, we could still remove it.
    if (Inst.modifiesRegister(AArch64::NZCV, TRI))
      return false;
  }

  // If we pass all the checks, it's safe to remove the PTEST and use the flags
  // as they are prior to PTEST. Sometimes this requires the tested PTEST
  // operand to be replaced with an equivalent instruction that also sets the
  // flags.
  Pred->setDesc(get(NewOp));
  PTest->eraseFromParent();
  if (OpChanged) {
    bool succeeded = UpdateOperandRegClass(*Pred);
    (void)succeeded;
    assert(succeeded && "Operands have incompatible register classes!");
    Pred->addRegisterDefined(AArch64::NZCV, TRI);
  }

  // Ensure that the flags def is live.
  if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
    unsigned i = 0, e = Pred->getNumOperands();
    for (; i != e; ++i) {
      MachineOperand &MO = Pred->getOperand(i);
      if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
        MO.setIsDead(false);
        break;
      }
    }
  }
  return true;
}

/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It is truly a compare instruction
/// only when there are no uses of its destination register.
///
/// The following steps are tried in order:
/// 1. Convert CmpInstr into an unconditional version.
/// 2. Remove CmpInstr if above there is an instruction producing a needed
///    condition code or an instruction which can be converted into such an
///    instruction.
///    Only comparison with zero is supported.
bool AArch64InstrInfo::optimizeCompareInstr(
    MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int CmpMask,
    int CmpValue, const MachineRegisterInfo *MRI) const {
  assert(CmpInstr.getParent());
  assert(MRI);

  // Replace SUBSWrr with SUBWrr if NZCV is not used.
  int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
  if (DeadNZCVIdx != -1) {
    if (CmpInstr.definesRegister(AArch64::WZR) ||
        CmpInstr.definesRegister(AArch64::XZR)) {
      CmpInstr.eraseFromParent();
      return true;
    }
    unsigned Opc = CmpInstr.getOpcode();
    unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
    if (NewOpc == Opc)
      return false;
    const MCInstrDesc &MCID = get(NewOpc);
    CmpInstr.setDesc(MCID);
    CmpInstr.RemoveOperand(DeadNZCVIdx);
    bool succeeded = UpdateOperandRegClass(CmpInstr);
    (void)succeeded;
    assert(succeeded && "Some operands reg class are incompatible!");
    return true;
  }

  if (CmpInstr.getOpcode() == AArch64::PTEST_PP)
    return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI);

  // Continue only if we have a "ri" where immediate is zero.
  // FIXME: CmpValue has already been converted to 0 or 1 in the
  // analyzeCompare function.
  assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!");
  if (CmpValue != 0 || SrcReg2 != 0)
    return false;

  // CmpInstr is a Compare instruction if destination register is not used.
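  // e.g. `%1:gpr32 = SUBSWri %0, 0, 0` whose %1 has no non-debug uses acts
  // purely as a compare against zero and is a candidate for
  // substituteCmpToZero() below.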
  if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
    return false;

  return substituteCmpToZero(CmpInstr, SrcReg, MRI);
}

/// Get opcode of S version of Instr.
/// If Instr is S version its opcode is returned.
/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S
/// version or we are not interested in it.
static unsigned sForm(MachineInstr &Instr) {
  switch (Instr.getOpcode()) {
  default:
    return AArch64::INSTRUCTION_LIST_END;

  case AArch64::ADDSWrr:
  case AArch64::ADDSWri:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXri:
  case AArch64::SUBSWrr:
  case AArch64::SUBSWri:
  case AArch64::SUBSXrr:
  case AArch64::SUBSXri:
    return Instr.getOpcode();

  case AArch64::ADDWrr:
    return AArch64::ADDSWrr;
  case AArch64::ADDWri:
    return AArch64::ADDSWri;
  case AArch64::ADDXrr:
    return AArch64::ADDSXrr;
  case AArch64::ADDXri:
    return AArch64::ADDSXri;
  case AArch64::ADCWr:
    return AArch64::ADCSWr;
  case AArch64::ADCXr:
    return AArch64::ADCSXr;
  case AArch64::SUBWrr:
    return AArch64::SUBSWrr;
  case AArch64::SUBWri:
    return AArch64::SUBSWri;
  case AArch64::SUBXrr:
    return AArch64::SUBSXrr;
  case AArch64::SUBXri:
    return AArch64::SUBSXri;
  case AArch64::SBCWr:
    return AArch64::SBCSWr;
  case AArch64::SBCXr:
    return AArch64::SBCSXr;
  case AArch64::ANDWri:
    return AArch64::ANDSWri;
  case AArch64::ANDXri:
    return AArch64::ANDSXri;
  }
}

/// Check if AArch64::NZCV should be alive in successors of MBB.
static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) {
  for (auto *BB : MBB->successors())
    if (BB->isLiveIn(AArch64::NZCV))
      return true;
  return false;
}

namespace {

struct UsedNZCV {
  bool N = false;
  bool Z = false;
  bool C = false;
  bool V = false;

  UsedNZCV() = default;

  UsedNZCV &operator|=(const UsedNZCV &UsedFlags) {
    this->N |= UsedFlags.N;
    this->Z |= UsedFlags.Z;
    this->C |= UsedFlags.C;
    this->V |= UsedFlags.V;
    return *this;
  }
};

} // end anonymous namespace

/// Find a condition code used by the instruction.
/// Returns AArch64CC::Invalid if either the instruction does not use condition
/// codes or we don't optimize CmpInstr in the presence of such instructions.
static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
  switch (Instr.getOpcode()) {
  default:
    return AArch64CC::Invalid;

  case AArch64::Bcc: {
    int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
    assert(Idx >= 2);
    return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm());
  }

  case AArch64::CSINVWr:
  case AArch64::CSINVXr:
  case AArch64::CSINCWr:
  case AArch64::CSINCXr:
  case AArch64::CSELWr:
  case AArch64::CSELXr:
  case AArch64::CSNEGWr:
  case AArch64::CSNEGXr:
  case AArch64::FCSELSrrr:
  case AArch64::FCSELDrrr: {
    int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV);
    assert(Idx >= 1);
    return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm());
  }
  }
}

static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
  assert(CC != AArch64CC::Invalid);
  UsedNZCV UsedFlags;
  switch (CC) {
  default:
    break;

  case AArch64CC::EQ: // Z set
  case AArch64CC::NE: // Z clear
    UsedFlags.Z = true;
    break;

  case AArch64CC::HI: // Z clear and C set
  case AArch64CC::LS: // Z set or C clear
    UsedFlags.Z = true;
    LLVM_FALLTHROUGH;
  case AArch64CC::HS: // C set
  case AArch64CC::LO: // C clear
    UsedFlags.C = true;
    break;

  case AArch64CC::MI: // N set
  case AArch64CC::PL: // N clear
    UsedFlags.N = true;
    break;

  case AArch64CC::VS: // V set
  case AArch64CC::VC: // V clear
    UsedFlags.V = true;
    break;

  case AArch64CC::GT: // Z clear, N and V the same
  case AArch64CC::LE: // Z set, N and V differ
    UsedFlags.Z = true;
    LLVM_FALLTHROUGH;
  case AArch64CC::GE: // N and V the same
  case AArch64CC::LT: // N and V differ
    UsedFlags.N = true;
    UsedFlags.V = true;
    break;
  }
  return UsedFlags;
}

static bool isADDSRegImm(unsigned Opcode) {
  return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
}

static bool isSUBSRegImm(unsigned Opcode) {
  return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
}

/// Check if CmpInstr can be substituted by MI.
///
/// CmpInstr can be substituted:
/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
/// - and, MI and CmpInstr are from the same MachineBB
/// - and, condition flags are not alive in successors of the CmpInstr parent
/// - and, if MI opcode is the S form there must be no defs of flags between
///        MI and CmpInstr
///        or if MI opcode is not the S form there must be neither defs of flags
///        nor uses of flags between MI and CmpInstr.
/// - and C/V flags are not used after CmpInstr
static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr,
                                       const TargetRegisterInfo *TRI) {
  assert(MI);
  assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END);
  assert(CmpInstr);

  const unsigned CmpOpcode = CmpInstr->getOpcode();
  if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode))
    return false;

  if (MI->getParent() != CmpInstr->getParent())
    return false;

  if (areCFlagsAliveInSuccessors(CmpInstr->getParent()))
    return false;

  AccessKind AccessToCheck = AK_Write;
  if (sForm(*MI) != MI->getOpcode())
    AccessToCheck = AK_All;
  if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck))
    return false;

  UsedNZCV NZCVUsedAfterCmp;
  for (const MachineInstr &Instr :
       instructionsWithoutDebug(std::next(CmpInstr->getIterator()),
                                CmpInstr->getParent()->instr_end())) {
    if (Instr.readsRegister(AArch64::NZCV, TRI)) {
      AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
      if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
        return false;
      NZCVUsedAfterCmp |= getUsedNZCV(CC);
    }

    if (Instr.modifiesRegister(AArch64::NZCV, TRI))
      break;
  }

  return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V;
}

/// Substitute an instruction comparing to zero with another instruction
/// which produces needed condition flags.
///
/// Return true on success.
bool AArch64InstrInfo::substituteCmpToZero(
    MachineInstr &CmpInstr, unsigned SrcReg,
    const MachineRegisterInfo *MRI) const {
  assert(MRI);
  // Get the unique definition of SrcReg.
  MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
  if (!MI)
    return false;

  const TargetRegisterInfo *TRI = &getRegisterInfo();

  unsigned NewOpc = sForm(*MI);
  if (NewOpc == AArch64::INSTRUCTION_LIST_END)
    return false;

  if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI))
    return false;

  // Update the instruction to set NZCV.
  MI->setDesc(get(NewOpc));
  CmpInstr.eraseFromParent();
  bool succeeded = UpdateOperandRegClass(*MI);
  (void)succeeded;
  assert(succeeded && "Some operands reg class are incompatible!");
  MI->addRegisterDefined(AArch64::NZCV, TRI);
  return true;
}

bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
      MI.getOpcode() != AArch64::CATCHRET)
    return false;

  MachineBasicBlock &MBB = *MI.getParent();
  auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
  auto TRI = Subtarget.getRegisterInfo();
  DebugLoc DL = MI.getDebugLoc();

  if (MI.getOpcode() == AArch64::CATCHRET) {
    // Skip to the first instruction before the epilog.
1732 const TargetInstrInfo *TII = 1733 MBB.getParent()->getSubtarget().getInstrInfo(); 1734 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB(); 1735 auto MBBI = MachineBasicBlock::iterator(MI); 1736 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI); 1737 while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) && 1738 FirstEpilogSEH != MBB.begin()) 1739 FirstEpilogSEH = std::prev(FirstEpilogSEH); 1740 if (FirstEpilogSEH != MBB.begin()) 1741 FirstEpilogSEH = std::next(FirstEpilogSEH); 1742 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP)) 1743 .addReg(AArch64::X0, RegState::Define) 1744 .addMBB(TargetMBB); 1745 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri)) 1746 .addReg(AArch64::X0, RegState::Define) 1747 .addReg(AArch64::X0) 1748 .addMBB(TargetMBB) 1749 .addImm(0); 1750 return true; 1751 } 1752 1753 Register Reg = MI.getOperand(0).getReg(); 1754 const GlobalValue *GV = 1755 cast<GlobalValue>((*MI.memoperands_begin())->getValue()); 1756 const TargetMachine &TM = MBB.getParent()->getTarget(); 1757 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); 1758 const unsigned char MO_NC = AArch64II::MO_NC; 1759 1760 if ((OpFlags & AArch64II::MO_GOT) != 0) { 1761 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg) 1762 .addGlobalAddress(GV, 0, OpFlags); 1763 if (Subtarget.isTargetILP32()) { 1764 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 1765 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 1766 .addDef(Reg32, RegState::Dead) 1767 .addUse(Reg, RegState::Kill) 1768 .addImm(0) 1769 .addMemOperand(*MI.memoperands_begin()) 1770 .addDef(Reg, RegState::Implicit); 1771 } else { 1772 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 1773 .addReg(Reg, RegState::Kill) 1774 .addImm(0) 1775 .addMemOperand(*MI.memoperands_begin()); 1776 } 1777 } else if (TM.getCodeModel() == CodeModel::Large) { 1778 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?"); 1779 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) 1780 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC) 1781 .addImm(0); 1782 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 1783 .addReg(Reg, RegState::Kill) 1784 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC) 1785 .addImm(16); 1786 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 1787 .addReg(Reg, RegState::Kill) 1788 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC) 1789 .addImm(32); 1790 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 1791 .addReg(Reg, RegState::Kill) 1792 .addGlobalAddress(GV, 0, AArch64II::MO_G3) 1793 .addImm(48); 1794 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 1795 .addReg(Reg, RegState::Kill) 1796 .addImm(0) 1797 .addMemOperand(*MI.memoperands_begin()); 1798 } else if (TM.getCodeModel() == CodeModel::Tiny) { 1799 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg) 1800 .addGlobalAddress(GV, 0, OpFlags); 1801 } else { 1802 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg) 1803 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); 1804 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC; 1805 if (Subtarget.isTargetILP32()) { 1806 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 1807 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 1808 .addDef(Reg32, RegState::Dead) 1809 .addUse(Reg, RegState::Kill) 1810 .addGlobalAddress(GV, 0, LoFlags) 1811 .addMemOperand(*MI.memoperands_begin()) 1812 .addDef(Reg, RegState::Implicit); 1813 } else { 1814 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 1815 .addReg(Reg, RegState::Kill) 1816 .addGlobalAddress(GV, 0, LoFlags) 1817 .addMemOperand(*MI.memoperands_begin()); 1818 } 
1819 }
1820
1821   MBB.erase(MI);
1822
1823   return true;
1824 }
1825
1826 // Return true if this instruction simply sets its single destination register
1827 // to zero. This is equivalent to a register rename of the zero-register.
1828 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
1829   switch (MI.getOpcode()) {
1830   default:
1831     break;
1832   case AArch64::MOVZWi:
1833   case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
1834     if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
1835       assert(MI.getDesc().getNumOperands() == 3 &&
1836              MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
1837       return true;
1838     }
1839     break;
1840   case AArch64::ANDWri: // and Rd, Rzr, #imm
1841     return MI.getOperand(1).getReg() == AArch64::WZR;
1842   case AArch64::ANDXri:
1843     return MI.getOperand(1).getReg() == AArch64::XZR;
1844   case TargetOpcode::COPY:
1845     return MI.getOperand(1).getReg() == AArch64::WZR;
1846   }
1847   return false;
1848 }
1849
1850 // Return true if this instruction simply renames a general register without
1851 // modifying bits.
1852 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
1853   switch (MI.getOpcode()) {
1854   default:
1855     break;
1856   case TargetOpcode::COPY: {
1857     // GPR32 copies will be lowered to ORRXrs
1858     Register DstReg = MI.getOperand(0).getReg();
1859     return (AArch64::GPR32RegClass.contains(DstReg) ||
1860             AArch64::GPR64RegClass.contains(DstReg));
1861   }
1862   case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
1863     if (MI.getOperand(1).getReg() == AArch64::XZR) {
1864       assert(MI.getDesc().getNumOperands() == 4 &&
1865              MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
1866       return true;
1867     }
1868     break;
1869   case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
1870     if (MI.getOperand(2).getImm() == 0) {
1871       assert(MI.getDesc().getNumOperands() == 4 &&
1872              MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
1873       return true;
1874     }
1875     break;
1876   }
1877   return false;
1878 }
1879
1880 // Return true if this instruction simply renames a floating-point register
1881 // without modifying bits.
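// For example (illustrative, derived from the cases below): a full-width COPY
// between two D or Q registers qualifies, as does "orr vD.16b, vS.16b, vS.16b"
// with identical source operands; width-changing conversions do not.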
1882 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
1883   switch (MI.getOpcode()) {
1884   default:
1885     break;
1886   case TargetOpcode::COPY: {
1887     // FPR64 copies will be lowered to ORR.16b
1888     Register DstReg = MI.getOperand(0).getReg();
1889     return (AArch64::FPR64RegClass.contains(DstReg) ||
1890             AArch64::FPR128RegClass.contains(DstReg));
1891   }
1892   case AArch64::ORRv16i8:
1893     if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
1894       assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
1895              "invalid ORRv16i8 operands");
1896       return true;
1897     }
1898     break;
1899   }
1900   return false;
1901 }
1902
1903 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
1904                                                int &FrameIndex) const {
1905   switch (MI.getOpcode()) {
1906   default:
1907     break;
1908   case AArch64::LDRWui:
1909   case AArch64::LDRXui:
1910   case AArch64::LDRBui:
1911   case AArch64::LDRHui:
1912   case AArch64::LDRSui:
1913   case AArch64::LDRDui:
1914   case AArch64::LDRQui:
1915     if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
1916         MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
1917       FrameIndex = MI.getOperand(1).getIndex();
1918       return MI.getOperand(0).getReg();
1919     }
1920     break;
1921   }
1922
1923   return 0;
1924 }
1925
1926 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
1927                                               int &FrameIndex) const {
1928   switch (MI.getOpcode()) {
1929   default:
1930     break;
1931   case AArch64::STRWui:
1932   case AArch64::STRXui:
1933   case AArch64::STRBui:
1934   case AArch64::STRHui:
1935   case AArch64::STRSui:
1936   case AArch64::STRDui:
1937   case AArch64::STRQui:
1938   case AArch64::LDR_PXI:
1939   case AArch64::STR_PXI:
1940     if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
1941         MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
1942       FrameIndex = MI.getOperand(1).getIndex();
1943       return MI.getOperand(0).getReg();
1944     }
1945     break;
1946   }
1947   return 0;
1948 }
1949
1950 /// Check all MachineMemOperands for a hint to suppress pairing.
1951 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
1952   return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
1953     return MMO->getFlags() & MOSuppressPair;
1954   });
1955 }
1956
1957 /// Set a flag on the first MachineMemOperand to suppress pairing.
1958 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
1959   if (MI.memoperands_empty())
1960     return;
1961   (*MI.memoperands_begin())->setFlags(MOSuppressPair);
1962 }
1963
1964 /// Check all MachineMemOperands for a hint that the load/store is strided.
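/// As with isLdStPairSuppressed above, this just scans the instruction's
/// memory operands for the target-specific MOStridedAccess MMO flag.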
1965 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) { 1966 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 1967 return MMO->getFlags() & MOStridedAccess; 1968 }); 1969 } 1970 1971 bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) { 1972 switch (Opc) { 1973 default: 1974 return false; 1975 case AArch64::STURSi: 1976 case AArch64::STURDi: 1977 case AArch64::STURQi: 1978 case AArch64::STURBBi: 1979 case AArch64::STURHHi: 1980 case AArch64::STURWi: 1981 case AArch64::STURXi: 1982 case AArch64::LDURSi: 1983 case AArch64::LDURDi: 1984 case AArch64::LDURQi: 1985 case AArch64::LDURWi: 1986 case AArch64::LDURXi: 1987 case AArch64::LDURSWi: 1988 case AArch64::LDURHHi: 1989 case AArch64::LDURBBi: 1990 case AArch64::LDURSBWi: 1991 case AArch64::LDURSHWi: 1992 return true; 1993 } 1994 } 1995 1996 Optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) { 1997 switch (Opc) { 1998 default: return {}; 1999 case AArch64::PRFMui: return AArch64::PRFUMi; 2000 case AArch64::LDRXui: return AArch64::LDURXi; 2001 case AArch64::LDRWui: return AArch64::LDURWi; 2002 case AArch64::LDRBui: return AArch64::LDURBi; 2003 case AArch64::LDRHui: return AArch64::LDURHi; 2004 case AArch64::LDRSui: return AArch64::LDURSi; 2005 case AArch64::LDRDui: return AArch64::LDURDi; 2006 case AArch64::LDRQui: return AArch64::LDURQi; 2007 case AArch64::LDRBBui: return AArch64::LDURBBi; 2008 case AArch64::LDRHHui: return AArch64::LDURHHi; 2009 case AArch64::LDRSBXui: return AArch64::LDURSBXi; 2010 case AArch64::LDRSBWui: return AArch64::LDURSBWi; 2011 case AArch64::LDRSHXui: return AArch64::LDURSHXi; 2012 case AArch64::LDRSHWui: return AArch64::LDURSHWi; 2013 case AArch64::LDRSWui: return AArch64::LDURSWi; 2014 case AArch64::STRXui: return AArch64::STURXi; 2015 case AArch64::STRWui: return AArch64::STURWi; 2016 case AArch64::STRBui: return AArch64::STURBi; 2017 case AArch64::STRHui: return AArch64::STURHi; 2018 case AArch64::STRSui: return AArch64::STURSi; 2019 case AArch64::STRDui: return AArch64::STURDi; 2020 case AArch64::STRQui: return AArch64::STURQi; 2021 case AArch64::STRBBui: return AArch64::STURBBi; 2022 case AArch64::STRHHui: return AArch64::STURHHi; 2023 } 2024 } 2025 2026 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { 2027 switch (Opc) { 2028 default: 2029 return 2; 2030 case AArch64::LDPXi: 2031 case AArch64::LDPDi: 2032 case AArch64::STPXi: 2033 case AArch64::STPDi: 2034 case AArch64::LDNPXi: 2035 case AArch64::LDNPDi: 2036 case AArch64::STNPXi: 2037 case AArch64::STNPDi: 2038 case AArch64::LDPQi: 2039 case AArch64::STPQi: 2040 case AArch64::LDNPQi: 2041 case AArch64::STNPQi: 2042 case AArch64::LDPWi: 2043 case AArch64::LDPSi: 2044 case AArch64::STPWi: 2045 case AArch64::STPSi: 2046 case AArch64::LDNPWi: 2047 case AArch64::LDNPSi: 2048 case AArch64::STNPWi: 2049 case AArch64::STNPSi: 2050 case AArch64::LDG: 2051 case AArch64::STGPi: 2052 case AArch64::LD1B_IMM: 2053 case AArch64::LD1H_IMM: 2054 case AArch64::LD1W_IMM: 2055 case AArch64::LD1D_IMM: 2056 case AArch64::ST1B_IMM: 2057 case AArch64::ST1H_IMM: 2058 case AArch64::ST1W_IMM: 2059 case AArch64::ST1D_IMM: 2060 case AArch64::LD1B_H_IMM: 2061 case AArch64::LD1SB_H_IMM: 2062 case AArch64::LD1H_S_IMM: 2063 case AArch64::LD1SH_S_IMM: 2064 case AArch64::LD1W_D_IMM: 2065 case AArch64::LD1SW_D_IMM: 2066 case AArch64::ST1B_H_IMM: 2067 case AArch64::ST1H_S_IMM: 2068 case AArch64::ST1W_D_IMM: 2069 case AArch64::LD1B_S_IMM: 2070 case AArch64::LD1SB_S_IMM: 2071 case AArch64::LD1H_D_IMM: 2072 case AArch64::LD1SH_D_IMM: 
2073 case AArch64::ST1B_S_IMM: 2074 case AArch64::ST1H_D_IMM: 2075 case AArch64::LD1B_D_IMM: 2076 case AArch64::LD1SB_D_IMM: 2077 case AArch64::ST1B_D_IMM: 2078 return 3; 2079 case AArch64::ADDG: 2080 case AArch64::STGOffset: 2081 case AArch64::LDR_PXI: 2082 case AArch64::STR_PXI: 2083 return 2; 2084 } 2085 } 2086 2087 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) { 2088 switch (MI.getOpcode()) { 2089 default: 2090 return false; 2091 // Scaled instructions. 2092 case AArch64::STRSui: 2093 case AArch64::STRDui: 2094 case AArch64::STRQui: 2095 case AArch64::STRXui: 2096 case AArch64::STRWui: 2097 case AArch64::LDRSui: 2098 case AArch64::LDRDui: 2099 case AArch64::LDRQui: 2100 case AArch64::LDRXui: 2101 case AArch64::LDRWui: 2102 case AArch64::LDRSWui: 2103 // Unscaled instructions. 2104 case AArch64::STURSi: 2105 case AArch64::STURDi: 2106 case AArch64::STURQi: 2107 case AArch64::STURWi: 2108 case AArch64::STURXi: 2109 case AArch64::LDURSi: 2110 case AArch64::LDURDi: 2111 case AArch64::LDURQi: 2112 case AArch64::LDURWi: 2113 case AArch64::LDURXi: 2114 case AArch64::LDURSWi: 2115 return true; 2116 } 2117 } 2118 2119 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc, 2120 bool &Is64Bit) { 2121 switch (Opc) { 2122 default: 2123 llvm_unreachable("Opcode has no flag setting equivalent!"); 2124 // 32-bit cases: 2125 case AArch64::ADDWri: 2126 Is64Bit = false; 2127 return AArch64::ADDSWri; 2128 case AArch64::ADDWrr: 2129 Is64Bit = false; 2130 return AArch64::ADDSWrr; 2131 case AArch64::ADDWrs: 2132 Is64Bit = false; 2133 return AArch64::ADDSWrs; 2134 case AArch64::ADDWrx: 2135 Is64Bit = false; 2136 return AArch64::ADDSWrx; 2137 case AArch64::ANDWri: 2138 Is64Bit = false; 2139 return AArch64::ANDSWri; 2140 case AArch64::ANDWrr: 2141 Is64Bit = false; 2142 return AArch64::ANDSWrr; 2143 case AArch64::ANDWrs: 2144 Is64Bit = false; 2145 return AArch64::ANDSWrs; 2146 case AArch64::BICWrr: 2147 Is64Bit = false; 2148 return AArch64::BICSWrr; 2149 case AArch64::BICWrs: 2150 Is64Bit = false; 2151 return AArch64::BICSWrs; 2152 case AArch64::SUBWri: 2153 Is64Bit = false; 2154 return AArch64::SUBSWri; 2155 case AArch64::SUBWrr: 2156 Is64Bit = false; 2157 return AArch64::SUBSWrr; 2158 case AArch64::SUBWrs: 2159 Is64Bit = false; 2160 return AArch64::SUBSWrs; 2161 case AArch64::SUBWrx: 2162 Is64Bit = false; 2163 return AArch64::SUBSWrx; 2164 // 64-bit cases: 2165 case AArch64::ADDXri: 2166 Is64Bit = true; 2167 return AArch64::ADDSXri; 2168 case AArch64::ADDXrr: 2169 Is64Bit = true; 2170 return AArch64::ADDSXrr; 2171 case AArch64::ADDXrs: 2172 Is64Bit = true; 2173 return AArch64::ADDSXrs; 2174 case AArch64::ADDXrx: 2175 Is64Bit = true; 2176 return AArch64::ADDSXrx; 2177 case AArch64::ANDXri: 2178 Is64Bit = true; 2179 return AArch64::ANDSXri; 2180 case AArch64::ANDXrr: 2181 Is64Bit = true; 2182 return AArch64::ANDSXrr; 2183 case AArch64::ANDXrs: 2184 Is64Bit = true; 2185 return AArch64::ANDSXrs; 2186 case AArch64::BICXrr: 2187 Is64Bit = true; 2188 return AArch64::BICSXrr; 2189 case AArch64::BICXrs: 2190 Is64Bit = true; 2191 return AArch64::BICSXrs; 2192 case AArch64::SUBXri: 2193 Is64Bit = true; 2194 return AArch64::SUBSXri; 2195 case AArch64::SUBXrr: 2196 Is64Bit = true; 2197 return AArch64::SUBSXrr; 2198 case AArch64::SUBXrs: 2199 Is64Bit = true; 2200 return AArch64::SUBSXrs; 2201 case AArch64::SUBXrx: 2202 Is64Bit = true; 2203 return AArch64::SUBSXrx; 2204 } 2205 } 2206 2207 // Is this a candidate for ld/st merging or pairing? 
For example, we don't 2208 // touch volatiles or load/stores that have a hint to avoid pair formation. 2209 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const { 2210 // If this is a volatile load/store, don't mess with it. 2211 if (MI.hasOrderedMemoryRef()) 2212 return false; 2213 2214 // Make sure this is a reg/fi+imm (as opposed to an address reloc). 2215 assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) && 2216 "Expected a reg or frame index operand."); 2217 if (!MI.getOperand(2).isImm()) 2218 return false; 2219 2220 // Can't merge/pair if the instruction modifies the base register. 2221 // e.g., ldr x0, [x0] 2222 // This case will never occur with an FI base. 2223 if (MI.getOperand(1).isReg()) { 2224 Register BaseReg = MI.getOperand(1).getReg(); 2225 const TargetRegisterInfo *TRI = &getRegisterInfo(); 2226 if (MI.modifiesRegister(BaseReg, TRI)) 2227 return false; 2228 } 2229 2230 // Check if this load/store has a hint to avoid pair formation. 2231 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. 2232 if (isLdStPairSuppressed(MI)) 2233 return false; 2234 2235 // Do not pair any callee-save store/reload instructions in the 2236 // prologue/epilogue if the CFI information encoded the operations as separate 2237 // instructions, as that will cause the size of the actual prologue to mismatch 2238 // with the prologue size recorded in the Windows CFI. 2239 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo(); 2240 bool NeedsWinCFI = MAI->usesWindowsCFI() && 2241 MI.getMF()->getFunction().needsUnwindTableEntry(); 2242 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) || 2243 MI.getFlag(MachineInstr::FrameDestroy))) 2244 return false; 2245 2246 // On some CPUs quad load/store pairs are slower than two single load/stores. 2247 if (Subtarget.isPaired128Slow()) { 2248 switch (MI.getOpcode()) { 2249 default: 2250 break; 2251 case AArch64::LDURQi: 2252 case AArch64::STURQi: 2253 case AArch64::LDRQui: 2254 case AArch64::STRQui: 2255 return false; 2256 } 2257 } 2258 2259 return true; 2260 } 2261 2262 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth( 2263 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, 2264 int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, 2265 const TargetRegisterInfo *TRI) const { 2266 if (!LdSt.mayLoadOrStore()) 2267 return false; 2268 2269 const MachineOperand *BaseOp; 2270 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable, 2271 Width, TRI)) 2272 return false; 2273 BaseOps.push_back(BaseOp); 2274 return true; 2275 } 2276 2277 Optional<ExtAddrMode> 2278 AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI, 2279 const TargetRegisterInfo *TRI) const { 2280 const MachineOperand *Base; // Filled with the base operand of MI. 2281 int64_t Offset; // Filled with the offset of MI. 
2282 bool OffsetIsScalable; 2283 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI)) 2284 return None; 2285 2286 if (!Base->isReg()) 2287 return None; 2288 ExtAddrMode AM; 2289 AM.BaseReg = Base->getReg(); 2290 AM.Displacement = Offset; 2291 AM.ScaledReg = 0; 2292 return AM; 2293 } 2294 2295 bool AArch64InstrInfo::getMemOperandWithOffsetWidth( 2296 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset, 2297 bool &OffsetIsScalable, unsigned &Width, 2298 const TargetRegisterInfo *TRI) const { 2299 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 2300 // Handle only loads/stores with base register followed by immediate offset. 2301 if (LdSt.getNumExplicitOperands() == 3) { 2302 // Non-paired instruction (e.g., ldr x1, [x0, #8]). 2303 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) || 2304 !LdSt.getOperand(2).isImm()) 2305 return false; 2306 } else if (LdSt.getNumExplicitOperands() == 4) { 2307 // Paired instruction (e.g., ldp x1, x2, [x0, #8]). 2308 if (!LdSt.getOperand(1).isReg() || 2309 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) || 2310 !LdSt.getOperand(3).isImm()) 2311 return false; 2312 } else 2313 return false; 2314 2315 // Get the scaling factor for the instruction and set the width for the 2316 // instruction. 2317 TypeSize Scale(0U, false); 2318 int64_t Dummy1, Dummy2; 2319 2320 // If this returns false, then it's an instruction we don't want to handle. 2321 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2)) 2322 return false; 2323 2324 // Compute the offset. Offset is calculated as the immediate operand 2325 // multiplied by the scaling factor. Unscaled instructions have scaling factor 2326 // set to 1. 2327 if (LdSt.getNumExplicitOperands() == 3) { 2328 BaseOp = &LdSt.getOperand(1); 2329 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinSize(); 2330 } else { 2331 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); 2332 BaseOp = &LdSt.getOperand(2); 2333 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinSize(); 2334 } 2335 OffsetIsScalable = Scale.isScalable(); 2336 2337 if (!BaseOp->isReg() && !BaseOp->isFI()) 2338 return false; 2339 2340 return true; 2341 } 2342 2343 MachineOperand & 2344 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const { 2345 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 2346 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1); 2347 assert(OfsOp.isImm() && "Offset operand wasn't immediate."); 2348 return OfsOp; 2349 } 2350 2351 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, 2352 unsigned &Width, int64_t &MinOffset, 2353 int64_t &MaxOffset) { 2354 const unsigned SVEMaxBytesPerVector = AArch64::SVEMaxBitsPerVector / 8; 2355 switch (Opcode) { 2356 // Not a memory operation or something we want to handle. 
2357 default: 2358 Scale = TypeSize::Fixed(0); 2359 Width = 0; 2360 MinOffset = MaxOffset = 0; 2361 return false; 2362 case AArch64::STRWpost: 2363 case AArch64::LDRWpost: 2364 Width = 32; 2365 Scale = TypeSize::Fixed(4); 2366 MinOffset = -256; 2367 MaxOffset = 255; 2368 break; 2369 case AArch64::LDURQi: 2370 case AArch64::STURQi: 2371 Width = 16; 2372 Scale = TypeSize::Fixed(1); 2373 MinOffset = -256; 2374 MaxOffset = 255; 2375 break; 2376 case AArch64::PRFUMi: 2377 case AArch64::LDURXi: 2378 case AArch64::LDURDi: 2379 case AArch64::STURXi: 2380 case AArch64::STURDi: 2381 Width = 8; 2382 Scale = TypeSize::Fixed(1); 2383 MinOffset = -256; 2384 MaxOffset = 255; 2385 break; 2386 case AArch64::LDURWi: 2387 case AArch64::LDURSi: 2388 case AArch64::LDURSWi: 2389 case AArch64::STURWi: 2390 case AArch64::STURSi: 2391 Width = 4; 2392 Scale = TypeSize::Fixed(1); 2393 MinOffset = -256; 2394 MaxOffset = 255; 2395 break; 2396 case AArch64::LDURHi: 2397 case AArch64::LDURHHi: 2398 case AArch64::LDURSHXi: 2399 case AArch64::LDURSHWi: 2400 case AArch64::STURHi: 2401 case AArch64::STURHHi: 2402 Width = 2; 2403 Scale = TypeSize::Fixed(1); 2404 MinOffset = -256; 2405 MaxOffset = 255; 2406 break; 2407 case AArch64::LDURBi: 2408 case AArch64::LDURBBi: 2409 case AArch64::LDURSBXi: 2410 case AArch64::LDURSBWi: 2411 case AArch64::STURBi: 2412 case AArch64::STURBBi: 2413 Width = 1; 2414 Scale = TypeSize::Fixed(1); 2415 MinOffset = -256; 2416 MaxOffset = 255; 2417 break; 2418 case AArch64::LDPQi: 2419 case AArch64::LDNPQi: 2420 case AArch64::STPQi: 2421 case AArch64::STNPQi: 2422 Scale = TypeSize::Fixed(16); 2423 Width = 32; 2424 MinOffset = -64; 2425 MaxOffset = 63; 2426 break; 2427 case AArch64::LDRQui: 2428 case AArch64::STRQui: 2429 Scale = TypeSize::Fixed(16); 2430 Width = 16; 2431 MinOffset = 0; 2432 MaxOffset = 4095; 2433 break; 2434 case AArch64::LDPXi: 2435 case AArch64::LDPDi: 2436 case AArch64::LDNPXi: 2437 case AArch64::LDNPDi: 2438 case AArch64::STPXi: 2439 case AArch64::STPDi: 2440 case AArch64::STNPXi: 2441 case AArch64::STNPDi: 2442 Scale = TypeSize::Fixed(8); 2443 Width = 16; 2444 MinOffset = -64; 2445 MaxOffset = 63; 2446 break; 2447 case AArch64::PRFMui: 2448 case AArch64::LDRXui: 2449 case AArch64::LDRDui: 2450 case AArch64::STRXui: 2451 case AArch64::STRDui: 2452 Scale = TypeSize::Fixed(8); 2453 Width = 8; 2454 MinOffset = 0; 2455 MaxOffset = 4095; 2456 break; 2457 case AArch64::LDPWi: 2458 case AArch64::LDPSi: 2459 case AArch64::LDNPWi: 2460 case AArch64::LDNPSi: 2461 case AArch64::STPWi: 2462 case AArch64::STPSi: 2463 case AArch64::STNPWi: 2464 case AArch64::STNPSi: 2465 Scale = TypeSize::Fixed(4); 2466 Width = 8; 2467 MinOffset = -64; 2468 MaxOffset = 63; 2469 break; 2470 case AArch64::LDRWui: 2471 case AArch64::LDRSui: 2472 case AArch64::LDRSWui: 2473 case AArch64::STRWui: 2474 case AArch64::STRSui: 2475 Scale = TypeSize::Fixed(4); 2476 Width = 4; 2477 MinOffset = 0; 2478 MaxOffset = 4095; 2479 break; 2480 case AArch64::LDRHui: 2481 case AArch64::LDRHHui: 2482 case AArch64::LDRSHWui: 2483 case AArch64::LDRSHXui: 2484 case AArch64::STRHui: 2485 case AArch64::STRHHui: 2486 Scale = TypeSize::Fixed(2); 2487 Width = 2; 2488 MinOffset = 0; 2489 MaxOffset = 4095; 2490 break; 2491 case AArch64::LDRBui: 2492 case AArch64::LDRBBui: 2493 case AArch64::LDRSBWui: 2494 case AArch64::LDRSBXui: 2495 case AArch64::STRBui: 2496 case AArch64::STRBBui: 2497 Scale = TypeSize::Fixed(1); 2498 Width = 1; 2499 MinOffset = 0; 2500 MaxOffset = 4095; 2501 break; 2502 case AArch64::ADDG: 2503 Scale = 
TypeSize::Fixed(16); 2504 Width = 0; 2505 MinOffset = 0; 2506 MaxOffset = 63; 2507 break; 2508 case AArch64::TAGPstack: 2509 Scale = TypeSize::Fixed(16); 2510 Width = 0; 2511 // TAGP with a negative offset turns into SUBP, which has a maximum offset 2512 // of 63 (not 64!). 2513 MinOffset = -63; 2514 MaxOffset = 63; 2515 break; 2516 case AArch64::LDG: 2517 case AArch64::STGOffset: 2518 case AArch64::STZGOffset: 2519 Scale = TypeSize::Fixed(16); 2520 Width = 16; 2521 MinOffset = -256; 2522 MaxOffset = 255; 2523 break; 2524 case AArch64::STR_ZZZZXI: 2525 case AArch64::LDR_ZZZZXI: 2526 Scale = TypeSize::Scalable(16); 2527 Width = SVEMaxBytesPerVector * 4; 2528 MinOffset = -256; 2529 MaxOffset = 252; 2530 break; 2531 case AArch64::STR_ZZZXI: 2532 case AArch64::LDR_ZZZXI: 2533 Scale = TypeSize::Scalable(16); 2534 Width = SVEMaxBytesPerVector * 3; 2535 MinOffset = -256; 2536 MaxOffset = 253; 2537 break; 2538 case AArch64::STR_ZZXI: 2539 case AArch64::LDR_ZZXI: 2540 Scale = TypeSize::Scalable(16); 2541 Width = SVEMaxBytesPerVector * 2; 2542 MinOffset = -256; 2543 MaxOffset = 254; 2544 break; 2545 case AArch64::LDR_PXI: 2546 case AArch64::STR_PXI: 2547 Scale = TypeSize::Scalable(2); 2548 Width = SVEMaxBytesPerVector / 8; 2549 MinOffset = -256; 2550 MaxOffset = 255; 2551 break; 2552 case AArch64::LDR_ZXI: 2553 case AArch64::STR_ZXI: 2554 Scale = TypeSize::Scalable(16); 2555 Width = SVEMaxBytesPerVector; 2556 MinOffset = -256; 2557 MaxOffset = 255; 2558 break; 2559 case AArch64::LD1B_IMM: 2560 case AArch64::LD1H_IMM: 2561 case AArch64::LD1W_IMM: 2562 case AArch64::LD1D_IMM: 2563 case AArch64::ST1B_IMM: 2564 case AArch64::ST1H_IMM: 2565 case AArch64::ST1W_IMM: 2566 case AArch64::ST1D_IMM: 2567 // A full vectors worth of data 2568 // Width = mbytes * elements 2569 Scale = TypeSize::Scalable(16); 2570 Width = SVEMaxBytesPerVector; 2571 MinOffset = -8; 2572 MaxOffset = 7; 2573 break; 2574 case AArch64::LD1B_H_IMM: 2575 case AArch64::LD1SB_H_IMM: 2576 case AArch64::LD1H_S_IMM: 2577 case AArch64::LD1SH_S_IMM: 2578 case AArch64::LD1W_D_IMM: 2579 case AArch64::LD1SW_D_IMM: 2580 case AArch64::ST1B_H_IMM: 2581 case AArch64::ST1H_S_IMM: 2582 case AArch64::ST1W_D_IMM: 2583 // A half vector worth of data 2584 // Width = mbytes * elements 2585 Scale = TypeSize::Scalable(8); 2586 Width = SVEMaxBytesPerVector / 2; 2587 MinOffset = -8; 2588 MaxOffset = 7; 2589 break; 2590 case AArch64::LD1B_S_IMM: 2591 case AArch64::LD1SB_S_IMM: 2592 case AArch64::LD1H_D_IMM: 2593 case AArch64::LD1SH_D_IMM: 2594 case AArch64::ST1B_S_IMM: 2595 case AArch64::ST1H_D_IMM: 2596 // A quarter vector worth of data 2597 // Width = mbytes * elements 2598 Scale = TypeSize::Scalable(4); 2599 Width = SVEMaxBytesPerVector / 4; 2600 MinOffset = -8; 2601 MaxOffset = 7; 2602 break; 2603 case AArch64::LD1B_D_IMM: 2604 case AArch64::LD1SB_D_IMM: 2605 case AArch64::ST1B_D_IMM: 2606 // A eighth vector worth of data 2607 // Width = mbytes * elements 2608 Scale = TypeSize::Scalable(2); 2609 Width = SVEMaxBytesPerVector / 8; 2610 MinOffset = -8; 2611 MaxOffset = 7; 2612 break; 2613 case AArch64::ST2GOffset: 2614 case AArch64::STZ2GOffset: 2615 Scale = TypeSize::Fixed(16); 2616 Width = 32; 2617 MinOffset = -256; 2618 MaxOffset = 255; 2619 break; 2620 case AArch64::STGPi: 2621 Scale = TypeSize::Fixed(16); 2622 Width = 16; 2623 MinOffset = -64; 2624 MaxOffset = 63; 2625 break; 2626 } 2627 2628 return true; 2629 } 2630 2631 // Scaling factor for unscaled load or store. 
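// For example, per the switch below both AArch64::LDRXui and AArch64::LDURXi
// have a memory scale of 8 bytes, while the 128-bit Q-register forms
// (LDRQui, LDURQi, LDPQi, ...) return 16.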
2632 int AArch64InstrInfo::getMemScale(unsigned Opc) { 2633 switch (Opc) { 2634 default: 2635 llvm_unreachable("Opcode has unknown scale!"); 2636 case AArch64::LDRBBui: 2637 case AArch64::LDURBBi: 2638 case AArch64::LDRSBWui: 2639 case AArch64::LDURSBWi: 2640 case AArch64::STRBBui: 2641 case AArch64::STURBBi: 2642 return 1; 2643 case AArch64::LDRHHui: 2644 case AArch64::LDURHHi: 2645 case AArch64::LDRSHWui: 2646 case AArch64::LDURSHWi: 2647 case AArch64::STRHHui: 2648 case AArch64::STURHHi: 2649 return 2; 2650 case AArch64::LDRSui: 2651 case AArch64::LDURSi: 2652 case AArch64::LDRSWui: 2653 case AArch64::LDURSWi: 2654 case AArch64::LDRWui: 2655 case AArch64::LDURWi: 2656 case AArch64::STRSui: 2657 case AArch64::STURSi: 2658 case AArch64::STRWui: 2659 case AArch64::STURWi: 2660 case AArch64::LDPSi: 2661 case AArch64::LDPSWi: 2662 case AArch64::LDPWi: 2663 case AArch64::STPSi: 2664 case AArch64::STPWi: 2665 return 4; 2666 case AArch64::LDRDui: 2667 case AArch64::LDURDi: 2668 case AArch64::LDRXui: 2669 case AArch64::LDURXi: 2670 case AArch64::STRDui: 2671 case AArch64::STURDi: 2672 case AArch64::STRXui: 2673 case AArch64::STURXi: 2674 case AArch64::LDPDi: 2675 case AArch64::LDPXi: 2676 case AArch64::STPDi: 2677 case AArch64::STPXi: 2678 return 8; 2679 case AArch64::LDRQui: 2680 case AArch64::LDURQi: 2681 case AArch64::STRQui: 2682 case AArch64::STURQi: 2683 case AArch64::LDPQi: 2684 case AArch64::STPQi: 2685 case AArch64::STGOffset: 2686 case AArch64::STZGOffset: 2687 case AArch64::ST2GOffset: 2688 case AArch64::STZ2GOffset: 2689 case AArch64::STGPi: 2690 return 16; 2691 } 2692 } 2693 2694 // Scale the unscaled offsets. Returns false if the unscaled offset can't be 2695 // scaled. 2696 static bool scaleOffset(unsigned Opc, int64_t &Offset) { 2697 int Scale = AArch64InstrInfo::getMemScale(Opc); 2698 2699 // If the byte-offset isn't a multiple of the stride, we can't scale this 2700 // offset. 2701 if (Offset % Scale != 0) 2702 return false; 2703 2704 // Convert the byte-offset used by unscaled into an "element" offset used 2705 // by the scaled pair load/store instructions. 2706 Offset /= Scale; 2707 return true; 2708 } 2709 2710 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) { 2711 if (FirstOpc == SecondOpc) 2712 return true; 2713 // We can also pair sign-ext and zero-ext instructions. 2714 switch (FirstOpc) { 2715 default: 2716 return false; 2717 case AArch64::LDRWui: 2718 case AArch64::LDURWi: 2719 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi; 2720 case AArch64::LDRSWui: 2721 case AArch64::LDURSWi: 2722 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi; 2723 } 2724 // These instructions can't be paired based on their opcodes. 2725 return false; 2726 } 2727 2728 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, 2729 int64_t Offset1, unsigned Opcode1, int FI2, 2730 int64_t Offset2, unsigned Opcode2) { 2731 // Accesses through fixed stack object frame indices may access a different 2732 // fixed stack slot. Check that the object offsets + offsets match. 2733 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) { 2734 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1); 2735 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2); 2736 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered."); 2737 // Convert to scaled object offsets. 
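    // Illustrative example: two 8-byte accesses to fixed objects at byte
    // offsets 0 and 8, each with an instruction offset of 0, scale to object
    // offsets 0 and 1 and so pass the adjacency check below.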
2738 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1); 2739 if (ObjectOffset1 % Scale1 != 0) 2740 return false; 2741 ObjectOffset1 /= Scale1; 2742 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2); 2743 if (ObjectOffset2 % Scale2 != 0) 2744 return false; 2745 ObjectOffset2 /= Scale2; 2746 ObjectOffset1 += Offset1; 2747 ObjectOffset2 += Offset2; 2748 return ObjectOffset1 + 1 == ObjectOffset2; 2749 } 2750 2751 return FI1 == FI2; 2752 } 2753 2754 /// Detect opportunities for ldp/stp formation. 2755 /// 2756 /// Only called for LdSt for which getMemOperandWithOffset returns true. 2757 bool AArch64InstrInfo::shouldClusterMemOps( 2758 ArrayRef<const MachineOperand *> BaseOps1, 2759 ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads, 2760 unsigned NumBytes) const { 2761 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1); 2762 const MachineOperand &BaseOp1 = *BaseOps1.front(); 2763 const MachineOperand &BaseOp2 = *BaseOps2.front(); 2764 const MachineInstr &FirstLdSt = *BaseOp1.getParent(); 2765 const MachineInstr &SecondLdSt = *BaseOp2.getParent(); 2766 if (BaseOp1.getType() != BaseOp2.getType()) 2767 return false; 2768 2769 assert((BaseOp1.isReg() || BaseOp1.isFI()) && 2770 "Only base registers and frame indices are supported."); 2771 2772 // Check for both base regs and base FI. 2773 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg()) 2774 return false; 2775 2776 // Only cluster up to a single pair. 2777 if (NumLoads > 2) 2778 return false; 2779 2780 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt)) 2781 return false; 2782 2783 // Can we pair these instructions based on their opcodes? 2784 unsigned FirstOpc = FirstLdSt.getOpcode(); 2785 unsigned SecondOpc = SecondLdSt.getOpcode(); 2786 if (!canPairLdStOpc(FirstOpc, SecondOpc)) 2787 return false; 2788 2789 // Can't merge volatiles or load/stores that have a hint to avoid pair 2790 // formation, for example. 2791 if (!isCandidateToMergeOrPair(FirstLdSt) || 2792 !isCandidateToMergeOrPair(SecondLdSt)) 2793 return false; 2794 2795 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate. 2796 int64_t Offset1 = FirstLdSt.getOperand(2).getImm(); 2797 if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1)) 2798 return false; 2799 2800 int64_t Offset2 = SecondLdSt.getOperand(2).getImm(); 2801 if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2)) 2802 return false; 2803 2804 // Pairwise instructions have a 7-bit signed offset field. 2805 if (Offset1 > 63 || Offset1 < -64) 2806 return false; 2807 2808 // The caller should already have ordered First/SecondLdSt by offset. 
2809 // Note: except for non-equal frame index bases 2810 if (BaseOp1.isFI()) { 2811 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) && 2812 "Caller should have ordered offsets."); 2813 2814 const MachineFrameInfo &MFI = 2815 FirstLdSt.getParent()->getParent()->getFrameInfo(); 2816 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc, 2817 BaseOp2.getIndex(), Offset2, SecondOpc); 2818 } 2819 2820 assert(Offset1 <= Offset2 && "Caller should have ordered offsets."); 2821 2822 return Offset1 + 1 == Offset2; 2823 } 2824 2825 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, 2826 unsigned Reg, unsigned SubIdx, 2827 unsigned State, 2828 const TargetRegisterInfo *TRI) { 2829 if (!SubIdx) 2830 return MIB.addReg(Reg, State); 2831 2832 if (Register::isPhysicalRegister(Reg)) 2833 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); 2834 return MIB.addReg(Reg, State, SubIdx); 2835 } 2836 2837 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, 2838 unsigned NumRegs) { 2839 // We really want the positive remainder mod 32 here, that happens to be 2840 // easily obtainable with a mask. 2841 return ((DestReg - SrcReg) & 0x1f) < NumRegs; 2842 } 2843 2844 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB, 2845 MachineBasicBlock::iterator I, 2846 const DebugLoc &DL, MCRegister DestReg, 2847 MCRegister SrcReg, bool KillSrc, 2848 unsigned Opcode, 2849 ArrayRef<unsigned> Indices) const { 2850 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON"); 2851 const TargetRegisterInfo *TRI = &getRegisterInfo(); 2852 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 2853 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 2854 unsigned NumRegs = Indices.size(); 2855 2856 int SubReg = 0, End = NumRegs, Incr = 1; 2857 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) { 2858 SubReg = NumRegs - 1; 2859 End = -1; 2860 Incr = -1; 2861 } 2862 2863 for (; SubReg != End; SubReg += Incr) { 2864 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 2865 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 2866 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); 2867 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 2868 } 2869 } 2870 2871 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB, 2872 MachineBasicBlock::iterator I, 2873 DebugLoc DL, unsigned DestReg, 2874 unsigned SrcReg, bool KillSrc, 2875 unsigned Opcode, unsigned ZeroReg, 2876 llvm::ArrayRef<unsigned> Indices) const { 2877 const TargetRegisterInfo *TRI = &getRegisterInfo(); 2878 unsigned NumRegs = Indices.size(); 2879 2880 #ifndef NDEBUG 2881 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 2882 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 2883 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 && 2884 "GPR reg sequences should not be able to overlap"); 2885 #endif 2886 2887 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) { 2888 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 2889 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 2890 MIB.addReg(ZeroReg); 2891 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 2892 MIB.addImm(0); 2893 } 2894 } 2895 2896 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, 2897 MachineBasicBlock::iterator I, 2898 const DebugLoc &DL, MCRegister DestReg, 2899 MCRegister SrcReg, bool KillSrc) const { 2900 if (AArch64::GPR32spRegClass.contains(DestReg) && 
2901 (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { 2902 const TargetRegisterInfo *TRI = &getRegisterInfo(); 2903 2904 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) { 2905 // If either operand is WSP, expand to ADD #0. 2906 if (Subtarget.hasZeroCycleRegMove()) { 2907 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. 2908 MCRegister DestRegX = TRI->getMatchingSuperReg( 2909 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 2910 MCRegister SrcRegX = TRI->getMatchingSuperReg( 2911 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 2912 // This instruction is reading and writing X registers. This may upset 2913 // the register scavenger and machine verifier, so we need to indicate 2914 // that we are reading an undefined value from SrcRegX, but a proper 2915 // value from SrcReg. 2916 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX) 2917 .addReg(SrcRegX, RegState::Undef) 2918 .addImm(0) 2919 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) 2920 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 2921 } else { 2922 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg) 2923 .addReg(SrcReg, getKillRegState(KillSrc)) 2924 .addImm(0) 2925 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 2926 } 2927 } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) { 2928 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg) 2929 .addImm(0) 2930 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 2931 } else { 2932 if (Subtarget.hasZeroCycleRegMove()) { 2933 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. 2934 MCRegister DestRegX = TRI->getMatchingSuperReg( 2935 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 2936 MCRegister SrcRegX = TRI->getMatchingSuperReg( 2937 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 2938 // This instruction is reading and writing X registers. This may upset 2939 // the register scavenger and machine verifier, so we need to indicate 2940 // that we are reading an undefined value from SrcRegX, but a proper 2941 // value from SrcReg. 2942 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX) 2943 .addReg(AArch64::XZR) 2944 .addReg(SrcRegX, RegState::Undef) 2945 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 2946 } else { 2947 // Otherwise, expand to ORR WZR. 2948 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg) 2949 .addReg(AArch64::WZR) 2950 .addReg(SrcReg, getKillRegState(KillSrc)); 2951 } 2952 } 2953 return; 2954 } 2955 2956 // Copy a Predicate register by ORRing with itself. 2957 if (AArch64::PPRRegClass.contains(DestReg) && 2958 AArch64::PPRRegClass.contains(SrcReg)) { 2959 assert(Subtarget.hasSVE() && "Unexpected SVE register."); 2960 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg) 2961 .addReg(SrcReg) // Pg 2962 .addReg(SrcReg) 2963 .addReg(SrcReg, getKillRegState(KillSrc)); 2964 return; 2965 } 2966 2967 // Copy a Z register by ORRing with itself. 2968 if (AArch64::ZPRRegClass.contains(DestReg) && 2969 AArch64::ZPRRegClass.contains(SrcReg)) { 2970 assert(Subtarget.hasSVE() && "Unexpected SVE register."); 2971 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg) 2972 .addReg(SrcReg) 2973 .addReg(SrcReg, getKillRegState(KillSrc)); 2974 return; 2975 } 2976 2977 // Copy a Z register pair by copying the individual sub-registers. 
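  // E.g. (illustrative) copying Z4_Z5 into Z0_Z1 expands to two ORR_ZZZ
  // instructions, one per zsub sub-register, via copyPhysRegTuple.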
2978 if (AArch64::ZPR2RegClass.contains(DestReg) && 2979 AArch64::ZPR2RegClass.contains(SrcReg)) { 2980 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1}; 2981 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 2982 Indices); 2983 return; 2984 } 2985 2986 // Copy a Z register triple by copying the individual sub-registers. 2987 if (AArch64::ZPR3RegClass.contains(DestReg) && 2988 AArch64::ZPR3RegClass.contains(SrcReg)) { 2989 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, 2990 AArch64::zsub2}; 2991 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 2992 Indices); 2993 return; 2994 } 2995 2996 // Copy a Z register quad by copying the individual sub-registers. 2997 if (AArch64::ZPR4RegClass.contains(DestReg) && 2998 AArch64::ZPR4RegClass.contains(SrcReg)) { 2999 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, 3000 AArch64::zsub2, AArch64::zsub3}; 3001 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 3002 Indices); 3003 return; 3004 } 3005 3006 if (AArch64::GPR64spRegClass.contains(DestReg) && 3007 (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { 3008 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { 3009 // If either operand is SP, expand to ADD #0. 3010 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg) 3011 .addReg(SrcReg, getKillRegState(KillSrc)) 3012 .addImm(0) 3013 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3014 } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) { 3015 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg) 3016 .addImm(0) 3017 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3018 } else { 3019 // Otherwise, expand to ORR XZR. 3020 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg) 3021 .addReg(AArch64::XZR) 3022 .addReg(SrcReg, getKillRegState(KillSrc)); 3023 } 3024 return; 3025 } 3026 3027 // Copy a DDDD register quad by copying the individual sub-registers. 3028 if (AArch64::DDDDRegClass.contains(DestReg) && 3029 AArch64::DDDDRegClass.contains(SrcReg)) { 3030 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 3031 AArch64::dsub2, AArch64::dsub3}; 3032 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 3033 Indices); 3034 return; 3035 } 3036 3037 // Copy a DDD register triple by copying the individual sub-registers. 3038 if (AArch64::DDDRegClass.contains(DestReg) && 3039 AArch64::DDDRegClass.contains(SrcReg)) { 3040 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 3041 AArch64::dsub2}; 3042 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 3043 Indices); 3044 return; 3045 } 3046 3047 // Copy a DD register pair by copying the individual sub-registers. 3048 if (AArch64::DDRegClass.contains(DestReg) && 3049 AArch64::DDRegClass.contains(SrcReg)) { 3050 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1}; 3051 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 3052 Indices); 3053 return; 3054 } 3055 3056 // Copy a QQQQ register quad by copying the individual sub-registers. 3057 if (AArch64::QQQQRegClass.contains(DestReg) && 3058 AArch64::QQQQRegClass.contains(SrcReg)) { 3059 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 3060 AArch64::qsub2, AArch64::qsub3}; 3061 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 3062 Indices); 3063 return; 3064 } 3065 3066 // Copy a QQQ register triple by copying the individual sub-registers. 
3067 if (AArch64::QQQRegClass.contains(DestReg) && 3068 AArch64::QQQRegClass.contains(SrcReg)) { 3069 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 3070 AArch64::qsub2}; 3071 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 3072 Indices); 3073 return; 3074 } 3075 3076 // Copy a QQ register pair by copying the individual sub-registers. 3077 if (AArch64::QQRegClass.contains(DestReg) && 3078 AArch64::QQRegClass.contains(SrcReg)) { 3079 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1}; 3080 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 3081 Indices); 3082 return; 3083 } 3084 3085 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) && 3086 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) { 3087 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64}; 3088 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs, 3089 AArch64::XZR, Indices); 3090 return; 3091 } 3092 3093 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) && 3094 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) { 3095 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32}; 3096 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs, 3097 AArch64::WZR, Indices); 3098 return; 3099 } 3100 3101 if (AArch64::FPR128RegClass.contains(DestReg) && 3102 AArch64::FPR128RegClass.contains(SrcReg)) { 3103 if (Subtarget.hasNEON()) { 3104 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 3105 .addReg(SrcReg) 3106 .addReg(SrcReg, getKillRegState(KillSrc)); 3107 } else { 3108 BuildMI(MBB, I, DL, get(AArch64::STRQpre)) 3109 .addReg(AArch64::SP, RegState::Define) 3110 .addReg(SrcReg, getKillRegState(KillSrc)) 3111 .addReg(AArch64::SP) 3112 .addImm(-16); 3113 BuildMI(MBB, I, DL, get(AArch64::LDRQpre)) 3114 .addReg(AArch64::SP, RegState::Define) 3115 .addReg(DestReg, RegState::Define) 3116 .addReg(AArch64::SP) 3117 .addImm(16); 3118 } 3119 return; 3120 } 3121 3122 if (AArch64::FPR64RegClass.contains(DestReg) && 3123 AArch64::FPR64RegClass.contains(SrcReg)) { 3124 if (Subtarget.hasNEON()) { 3125 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub, 3126 &AArch64::FPR128RegClass); 3127 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub, 3128 &AArch64::FPR128RegClass); 3129 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 3130 .addReg(SrcReg) 3131 .addReg(SrcReg, getKillRegState(KillSrc)); 3132 } else { 3133 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg) 3134 .addReg(SrcReg, getKillRegState(KillSrc)); 3135 } 3136 return; 3137 } 3138 3139 if (AArch64::FPR32RegClass.contains(DestReg) && 3140 AArch64::FPR32RegClass.contains(SrcReg)) { 3141 if (Subtarget.hasNEON()) { 3142 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub, 3143 &AArch64::FPR128RegClass); 3144 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub, 3145 &AArch64::FPR128RegClass); 3146 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 3147 .addReg(SrcReg) 3148 .addReg(SrcReg, getKillRegState(KillSrc)); 3149 } else { 3150 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 3151 .addReg(SrcReg, getKillRegState(KillSrc)); 3152 } 3153 return; 3154 } 3155 3156 if (AArch64::FPR16RegClass.contains(DestReg) && 3157 AArch64::FPR16RegClass.contains(SrcReg)) { 3158 if (Subtarget.hasNEON()) { 3159 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub, 3160 &AArch64::FPR128RegClass); 3161 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub, 3162 &AArch64::FPR128RegClass); 3163 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 
3164 .addReg(SrcReg) 3165 .addReg(SrcReg, getKillRegState(KillSrc)); 3166 } else { 3167 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub, 3168 &AArch64::FPR32RegClass); 3169 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub, 3170 &AArch64::FPR32RegClass); 3171 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 3172 .addReg(SrcReg, getKillRegState(KillSrc)); 3173 } 3174 return; 3175 } 3176 3177 if (AArch64::FPR8RegClass.contains(DestReg) && 3178 AArch64::FPR8RegClass.contains(SrcReg)) { 3179 if (Subtarget.hasNEON()) { 3180 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub, 3181 &AArch64::FPR128RegClass); 3182 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, 3183 &AArch64::FPR128RegClass); 3184 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 3185 .addReg(SrcReg) 3186 .addReg(SrcReg, getKillRegState(KillSrc)); 3187 } else { 3188 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub, 3189 &AArch64::FPR32RegClass); 3190 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, 3191 &AArch64::FPR32RegClass); 3192 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 3193 .addReg(SrcReg, getKillRegState(KillSrc)); 3194 } 3195 return; 3196 } 3197 3198 // Copies between GPR64 and FPR64. 3199 if (AArch64::FPR64RegClass.contains(DestReg) && 3200 AArch64::GPR64RegClass.contains(SrcReg)) { 3201 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) 3202 .addReg(SrcReg, getKillRegState(KillSrc)); 3203 return; 3204 } 3205 if (AArch64::GPR64RegClass.contains(DestReg) && 3206 AArch64::FPR64RegClass.contains(SrcReg)) { 3207 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg) 3208 .addReg(SrcReg, getKillRegState(KillSrc)); 3209 return; 3210 } 3211 // Copies between GPR32 and FPR32. 3212 if (AArch64::FPR32RegClass.contains(DestReg) && 3213 AArch64::GPR32RegClass.contains(SrcReg)) { 3214 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) 3215 .addReg(SrcReg, getKillRegState(KillSrc)); 3216 return; 3217 } 3218 if (AArch64::GPR32RegClass.contains(DestReg) && 3219 AArch64::FPR32RegClass.contains(SrcReg)) { 3220 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg) 3221 .addReg(SrcReg, getKillRegState(KillSrc)); 3222 return; 3223 } 3224 3225 if (DestReg == AArch64::NZCV) { 3226 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy"); 3227 BuildMI(MBB, I, DL, get(AArch64::MSR)) 3228 .addImm(AArch64SysReg::NZCV) 3229 .addReg(SrcReg, getKillRegState(KillSrc)) 3230 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define); 3231 return; 3232 } 3233 3234 if (SrcReg == AArch64::NZCV) { 3235 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy"); 3236 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg) 3237 .addImm(AArch64SysReg::NZCV) 3238 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); 3239 return; 3240 } 3241 3242 llvm_unreachable("unimplemented reg-to-reg copy"); 3243 } 3244 3245 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, 3246 MachineBasicBlock &MBB, 3247 MachineBasicBlock::iterator InsertBefore, 3248 const MCInstrDesc &MCID, 3249 Register SrcReg, bool IsKill, 3250 unsigned SubIdx0, unsigned SubIdx1, int FI, 3251 MachineMemOperand *MMO) { 3252 Register SrcReg0 = SrcReg; 3253 Register SrcReg1 = SrcReg; 3254 if (Register::isPhysicalRegister(SrcReg)) { 3255 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0); 3256 SubIdx0 = 0; 3257 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1); 3258 SubIdx1 = 0; 3259 } 3260 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 3261 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0) 3262 .addReg(SrcReg1, 
getKillRegState(IsKill), SubIdx1) 3263 .addFrameIndex(FI) 3264 .addImm(0) 3265 .addMemOperand(MMO); 3266 } 3267 3268 void AArch64InstrInfo::storeRegToStackSlot( 3269 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, 3270 bool isKill, int FI, const TargetRegisterClass *RC, 3271 const TargetRegisterInfo *TRI) const { 3272 MachineFunction &MF = *MBB.getParent(); 3273 MachineFrameInfo &MFI = MF.getFrameInfo(); 3274 3275 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 3276 MachineMemOperand *MMO = 3277 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, 3278 MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); 3279 unsigned Opc = 0; 3280 bool Offset = true; 3281 unsigned StackID = TargetStackID::Default; 3282 switch (TRI->getSpillSize(*RC)) { 3283 case 1: 3284 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 3285 Opc = AArch64::STRBui; 3286 break; 3287 case 2: 3288 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 3289 Opc = AArch64::STRHui; 3290 else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { 3291 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3292 Opc = AArch64::STR_PXI; 3293 StackID = TargetStackID::ScalableVector; 3294 } 3295 break; 3296 case 4: 3297 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 3298 Opc = AArch64::STRWui; 3299 if (Register::isVirtualRegister(SrcReg)) 3300 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass); 3301 else 3302 assert(SrcReg != AArch64::WSP); 3303 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 3304 Opc = AArch64::STRSui; 3305 break; 3306 case 8: 3307 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 3308 Opc = AArch64::STRXui; 3309 if (Register::isVirtualRegister(SrcReg)) 3310 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 3311 else 3312 assert(SrcReg != AArch64::SP); 3313 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 3314 Opc = AArch64::STRDui; 3315 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 3316 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 3317 get(AArch64::STPWi), SrcReg, isKill, 3318 AArch64::sube32, AArch64::subo32, FI, MMO); 3319 return; 3320 } 3321 break; 3322 case 16: 3323 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 3324 Opc = AArch64::STRQui; 3325 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 3326 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3327 Opc = AArch64::ST1Twov1d; 3328 Offset = false; 3329 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 3330 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 3331 get(AArch64::STPXi), SrcReg, isKill, 3332 AArch64::sube64, AArch64::subo64, FI, MMO); 3333 return; 3334 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 3335 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3336 Opc = AArch64::STR_ZXI; 3337 StackID = TargetStackID::ScalableVector; 3338 } 3339 break; 3340 case 24: 3341 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 3342 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3343 Opc = AArch64::ST1Threev1d; 3344 Offset = false; 3345 } 3346 break; 3347 case 32: 3348 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 3349 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3350 Opc = AArch64::ST1Fourv1d; 3351 Offset = false; 3352 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 3353 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3354 Opc = AArch64::ST1Twov2d; 3355 Offset = false; 3356 } else if 
(AArch64::ZPR2RegClass.hasSubClassEq(RC)) { 3357 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3358 Opc = AArch64::STR_ZZXI; 3359 StackID = TargetStackID::ScalableVector; 3360 } 3361 break; 3362 case 48: 3363 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 3364 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3365 Opc = AArch64::ST1Threev2d; 3366 Offset = false; 3367 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { 3368 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3369 Opc = AArch64::STR_ZZZXI; 3370 StackID = TargetStackID::ScalableVector; 3371 } 3372 break; 3373 case 64: 3374 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 3375 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3376 Opc = AArch64::ST1Fourv2d; 3377 Offset = false; 3378 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { 3379 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3380 Opc = AArch64::STR_ZZZZXI; 3381 StackID = TargetStackID::ScalableVector; 3382 } 3383 break; 3384 } 3385 assert(Opc && "Unknown register class"); 3386 MFI.setStackID(FI, StackID); 3387 3388 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 3389 .addReg(SrcReg, getKillRegState(isKill)) 3390 .addFrameIndex(FI); 3391 3392 if (Offset) 3393 MI.addImm(0); 3394 MI.addMemOperand(MMO); 3395 } 3396 3397 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, 3398 MachineBasicBlock &MBB, 3399 MachineBasicBlock::iterator InsertBefore, 3400 const MCInstrDesc &MCID, 3401 Register DestReg, unsigned SubIdx0, 3402 unsigned SubIdx1, int FI, 3403 MachineMemOperand *MMO) { 3404 Register DestReg0 = DestReg; 3405 Register DestReg1 = DestReg; 3406 bool IsUndef = true; 3407 if (Register::isPhysicalRegister(DestReg)) { 3408 DestReg0 = TRI.getSubReg(DestReg, SubIdx0); 3409 SubIdx0 = 0; 3410 DestReg1 = TRI.getSubReg(DestReg, SubIdx1); 3411 SubIdx1 = 0; 3412 IsUndef = false; 3413 } 3414 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 3415 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0) 3416 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1) 3417 .addFrameIndex(FI) 3418 .addImm(0) 3419 .addMemOperand(MMO); 3420 } 3421 3422 void AArch64InstrInfo::loadRegFromStackSlot( 3423 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, 3424 int FI, const TargetRegisterClass *RC, 3425 const TargetRegisterInfo *TRI) const { 3426 MachineFunction &MF = *MBB.getParent(); 3427 MachineFrameInfo &MFI = MF.getFrameInfo(); 3428 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 3429 MachineMemOperand *MMO = 3430 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, 3431 MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); 3432 3433 unsigned Opc = 0; 3434 bool Offset = true; 3435 unsigned StackID = TargetStackID::Default; 3436 switch (TRI->getSpillSize(*RC)) { 3437 case 1: 3438 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 3439 Opc = AArch64::LDRBui; 3440 break; 3441 case 2: 3442 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 3443 Opc = AArch64::LDRHui; 3444 else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { 3445 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 3446 Opc = AArch64::LDR_PXI; 3447 StackID = TargetStackID::ScalableVector; 3448 } 3449 break; 3450 case 4: 3451 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 3452 Opc = AArch64::LDRWui; 3453 if (Register::isVirtualRegister(DestReg)) 3454 MF.getRegInfo().constrainRegClass(DestReg, 
&AArch64::GPR32RegClass); 3455 else 3456 assert(DestReg != AArch64::WSP); 3457 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 3458 Opc = AArch64::LDRSui; 3459 break; 3460 case 8: 3461 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 3462 Opc = AArch64::LDRXui; 3463 if (Register::isVirtualRegister(DestReg)) 3464 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass); 3465 else 3466 assert(DestReg != AArch64::SP); 3467 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 3468 Opc = AArch64::LDRDui; 3469 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 3470 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 3471 get(AArch64::LDPWi), DestReg, AArch64::sube32, 3472 AArch64::subo32, FI, MMO); 3473 return; 3474 } 3475 break; 3476 case 16: 3477 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 3478 Opc = AArch64::LDRQui; 3479 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 3480 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3481 Opc = AArch64::LD1Twov1d; 3482 Offset = false; 3483 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 3484 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 3485 get(AArch64::LDPXi), DestReg, AArch64::sube64, 3486 AArch64::subo64, FI, MMO); 3487 return; 3488 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 3489 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 3490 Opc = AArch64::LDR_ZXI; 3491 StackID = TargetStackID::ScalableVector; 3492 } 3493 break; 3494 case 24: 3495 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 3496 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3497 Opc = AArch64::LD1Threev1d; 3498 Offset = false; 3499 } 3500 break; 3501 case 32: 3502 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 3503 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3504 Opc = AArch64::LD1Fourv1d; 3505 Offset = false; 3506 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 3507 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3508 Opc = AArch64::LD1Twov2d; 3509 Offset = false; 3510 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { 3511 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 3512 Opc = AArch64::LDR_ZZXI; 3513 StackID = TargetStackID::ScalableVector; 3514 } 3515 break; 3516 case 48: 3517 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 3518 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3519 Opc = AArch64::LD1Threev2d; 3520 Offset = false; 3521 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { 3522 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 3523 Opc = AArch64::LDR_ZZZXI; 3524 StackID = TargetStackID::ScalableVector; 3525 } 3526 break; 3527 case 64: 3528 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 3529 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3530 Opc = AArch64::LD1Fourv2d; 3531 Offset = false; 3532 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { 3533 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 3534 Opc = AArch64::LDR_ZZZZXI; 3535 StackID = TargetStackID::ScalableVector; 3536 } 3537 break; 3538 } 3539 3540 assert(Opc && "Unknown register class"); 3541 MFI.setStackID(FI, StackID); 3542 3543 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 3544 .addReg(DestReg, getDefRegState(true)) 3545 .addFrameIndex(FI); 3546 if (Offset) 3547 MI.addImm(0); 3548 MI.addMemOperand(MMO); 3549 } 3550 3551 bool llvm::isNZCVTouchedInInstructionRange(const 
                                           MachineInstr &DefMI,
                                           const MachineInstr &UseMI,
                                           const TargetRegisterInfo *TRI) {
  return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
                                         UseMI.getIterator()),
                [TRI](const MachineInstr &I) {
                  return I.modifiesRegister(AArch64::NZCV, TRI) ||
                         I.readsRegister(AArch64::NZCV, TRI);
                });
}

void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
    const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
  // The smallest scalable element supported by scaled SVE addressing modes is
  // a predicate, which is 2 scalable bytes in size. So the scalable byte
  // offset must always be a multiple of 2.
  assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");

  // VGSized offsets are divided by '2', because the VG register is the number
  // of 64-bit granules as opposed to 128-bit vector chunks, which is how the
  // 'n' in e.g. MVT::nxv1i8 is modelled.
  // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
  // VG = n * 2 and the DWARF offset must be VG * 8 bytes.
  ByteSized = Offset.getFixed();
  VGSized = Offset.getScalable() / 2;
}

/// Returns the offset in parts to which this frame offset can be
/// decomposed for the purpose of describing a frame offset.
/// For a non-scalable offset this is simply its byte size.
void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
    const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
    int64_t &NumDataVectors) {
  // The smallest scalable element supported by scaled SVE addressing modes is
  // a predicate, which is 2 scalable bytes in size. So the scalable byte
  // offset must always be a multiple of 2.
  assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");

  NumBytes = Offset.getFixed();
  NumDataVectors = 0;
  NumPredicateVectors = Offset.getScalable() / 2;
  // This method is used to get the offsets to adjust the frame offset.
  // If the function requires ADDPL to be used and needs more than two ADDPL
  // instructions, part of the offset is folded into NumDataVectors so that it
  // uses ADDVL for part of it, reducing the number of ADDPL instructions.
  if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
      NumPredicateVectors > 62) {
    NumDataVectors = NumPredicateVectors / 8;
    NumPredicateVectors -= NumDataVectors * 8;
  }
}

// Helper function to emit a frame offset adjustment from a given
// pointer (SrcReg), stored into DestReg. This function is explicit
// in that it requires the opcode.
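// For example (illustrative, derived from the encoding limits below): with
// ADDXri the immediate is a 12-bit value optionally shifted left by 12, so an
// SP adjustment of 0x12345 is emitted as a short chain:
//   add sp, sp, #0x12, lsl #12
//   add sp, sp, #0x345
// Scalable adjustments (ADDVL_XXI / ADDPL_XXI) are likewise split into chunks
// of at most 31 (or -32) vector/predicate increments per instruction.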
3606 static void emitFrameOffsetAdj(MachineBasicBlock &MBB, 3607 MachineBasicBlock::iterator MBBI, 3608 const DebugLoc &DL, unsigned DestReg, 3609 unsigned SrcReg, int64_t Offset, unsigned Opc, 3610 const TargetInstrInfo *TII, 3611 MachineInstr::MIFlag Flag, bool NeedsWinCFI, 3612 bool *HasWinCFI) { 3613 int Sign = 1; 3614 unsigned MaxEncoding, ShiftSize; 3615 switch (Opc) { 3616 case AArch64::ADDXri: 3617 case AArch64::ADDSXri: 3618 case AArch64::SUBXri: 3619 case AArch64::SUBSXri: 3620 MaxEncoding = 0xfff; 3621 ShiftSize = 12; 3622 break; 3623 case AArch64::ADDVL_XXI: 3624 case AArch64::ADDPL_XXI: 3625 MaxEncoding = 31; 3626 ShiftSize = 0; 3627 if (Offset < 0) { 3628 MaxEncoding = 32; 3629 Sign = -1; 3630 Offset = -Offset; 3631 } 3632 break; 3633 default: 3634 llvm_unreachable("Unsupported opcode"); 3635 } 3636 3637 // FIXME: If the offset won't fit in 24-bits, compute the offset into a 3638 // scratch register. If DestReg is a virtual register, use it as the 3639 // scratch register; otherwise, create a new virtual register (to be 3640 // replaced by the scavenger at the end of PEI). That case can be optimized 3641 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch 3642 // register can be loaded with offset%8 and the add/sub can use an extending 3643 // instruction with LSL#3. 3644 // Currently the function handles any offsets but generates a poor sequence 3645 // of code. 3646 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); 3647 3648 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; 3649 Register TmpReg = DestReg; 3650 if (TmpReg == AArch64::XZR) 3651 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister( 3652 &AArch64::GPR64RegClass); 3653 do { 3654 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue); 3655 unsigned LocalShiftSize = 0; 3656 if (ThisVal > MaxEncoding) { 3657 ThisVal = ThisVal >> ShiftSize; 3658 LocalShiftSize = ShiftSize; 3659 } 3660 assert((ThisVal >> ShiftSize) <= MaxEncoding && 3661 "Encoding cannot handle value that big"); 3662 3663 Offset -= ThisVal << LocalShiftSize; 3664 if (Offset == 0) 3665 TmpReg = DestReg; 3666 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg) 3667 .addReg(SrcReg) 3668 .addImm(Sign * (int)ThisVal); 3669 if (ShiftSize) 3670 MBI = MBI.addImm( 3671 AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize)); 3672 MBI = MBI.setMIFlag(Flag); 3673 3674 if (NeedsWinCFI) { 3675 assert(Sign == 1 && "SEH directives should always have a positive sign"); 3676 int Imm = (int)(ThisVal << LocalShiftSize); 3677 if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) || 3678 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) { 3679 if (HasWinCFI) 3680 *HasWinCFI = true; 3681 if (Imm == 0) 3682 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag); 3683 else 3684 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)) 3685 .addImm(Imm) 3686 .setMIFlag(Flag); 3687 assert(Offset == 0 && "Expected remaining offset to be zero to " 3688 "emit a single SEH directive"); 3689 } else if (DestReg == AArch64::SP) { 3690 if (HasWinCFI) 3691 *HasWinCFI = true; 3692 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc"); 3693 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) 3694 .addImm(Imm) 3695 .setMIFlag(Flag); 3696 } 3697 if (HasWinCFI) 3698 *HasWinCFI = true; 3699 } 3700 3701 SrcReg = TmpReg; 3702 } while (Offset); 3703 } 3704 3705 void llvm::emitFrameOffset(MachineBasicBlock &MBB, 3706 MachineBasicBlock::iterator MBBI, const DebugLoc &DL, 3707 
unsigned DestReg, unsigned SrcReg, 3708 StackOffset Offset, const TargetInstrInfo *TII, 3709 MachineInstr::MIFlag Flag, bool SetNZCV, 3710 bool NeedsWinCFI, bool *HasWinCFI) { 3711 int64_t Bytes, NumPredicateVectors, NumDataVectors; 3712 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( 3713 Offset, Bytes, NumPredicateVectors, NumDataVectors); 3714 3715 // First emit non-scalable frame offsets, or a simple 'mov'. 3716 if (Bytes || (!Offset && SrcReg != DestReg)) { 3717 assert((DestReg != AArch64::SP || Bytes % 8 == 0) && 3718 "SP increment/decrement not 8-byte aligned"); 3719 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri; 3720 if (Bytes < 0) { 3721 Bytes = -Bytes; 3722 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri; 3723 } 3724 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag, 3725 NeedsWinCFI, HasWinCFI); 3726 SrcReg = DestReg; 3727 } 3728 3729 assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) && 3730 "SetNZCV not supported with SVE vectors"); 3731 assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) && 3732 "WinCFI not supported with SVE vectors"); 3733 3734 if (NumDataVectors) { 3735 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors, 3736 AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr); 3737 SrcReg = DestReg; 3738 } 3739 3740 if (NumPredicateVectors) { 3741 assert(DestReg != AArch64::SP && "Unaligned access to SP"); 3742 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors, 3743 AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr); 3744 } 3745 } 3746 3747 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( 3748 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, 3749 MachineBasicBlock::iterator InsertPt, int FrameIndex, 3750 LiveIntervals *LIS, VirtRegMap *VRM) const { 3751 // This is a bit of a hack. Consider this instruction: 3752 // 3753 // %0 = COPY %sp; GPR64all:%0 3754 // 3755 // We explicitly chose GPR64all for the virtual register so such a copy might 3756 // be eliminated by RegisterCoalescer. However, that may not be possible, and 3757 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all 3758 // register class, TargetInstrInfo::foldMemoryOperand() is going to try. 3759 // 3760 // To prevent that, we are going to constrain the %0 register class here. 3761 // 3762 // <rdar://problem/11522048> 3763 // 3764 if (MI.isFullCopy()) { 3765 Register DstReg = MI.getOperand(0).getReg(); 3766 Register SrcReg = MI.getOperand(1).getReg(); 3767 if (SrcReg == AArch64::SP && Register::isVirtualRegister(DstReg)) { 3768 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass); 3769 return nullptr; 3770 } 3771 if (DstReg == AArch64::SP && Register::isVirtualRegister(SrcReg)) { 3772 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 3773 return nullptr; 3774 } 3775 } 3776 3777 // Handle the case where a copy is being spilled or filled but the source 3778 // and destination register class don't match. For example: 3779 // 3780 // %0 = COPY %xzr; GPR64common:%0 3781 // 3782 // In this case we can still safely fold away the COPY and generate the 3783 // following spill code: 3784 // 3785 // STRXui %xzr, %stack.0 3786 // 3787 // This also eliminates spilled cross register class COPYs (e.g. between x and 3788 // d regs) of the same size. 
For example: 3789 // 3790 // %0 = COPY %1; GPR64:%0, FPR64:%1 3791 // 3792 // will be filled as 3793 // 3794 // LDRDui %0, fi<#0> 3795 // 3796 // instead of 3797 // 3798 // LDRXui %Temp, fi<#0> 3799 // %0 = FMOV %Temp 3800 // 3801 if (MI.isCopy() && Ops.size() == 1 && 3802 // Make sure we're only folding the explicit COPY defs/uses. 3803 (Ops[0] == 0 || Ops[0] == 1)) { 3804 bool IsSpill = Ops[0] == 0; 3805 bool IsFill = !IsSpill; 3806 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); 3807 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3808 MachineBasicBlock &MBB = *MI.getParent(); 3809 const MachineOperand &DstMO = MI.getOperand(0); 3810 const MachineOperand &SrcMO = MI.getOperand(1); 3811 Register DstReg = DstMO.getReg(); 3812 Register SrcReg = SrcMO.getReg(); 3813 // This is slightly expensive to compute for physical regs since 3814 // getMinimalPhysRegClass is slow. 3815 auto getRegClass = [&](unsigned Reg) { 3816 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg) 3817 : TRI.getMinimalPhysRegClass(Reg); 3818 }; 3819 3820 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { 3821 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) == 3822 TRI.getRegSizeInBits(*getRegClass(SrcReg)) && 3823 "Mismatched register size in non subreg COPY"); 3824 if (IsSpill) 3825 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, 3826 getRegClass(SrcReg), &TRI); 3827 else 3828 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, 3829 getRegClass(DstReg), &TRI); 3830 return &*--InsertPt; 3831 } 3832 3833 // Handle cases like spilling def of: 3834 // 3835 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0 3836 // 3837 // where the physical register source can be widened and stored to the full 3838 // virtual reg destination stack slot, in this case producing: 3839 // 3840 // STRXui %xzr, %stack.0 3841 // 3842 if (IsSpill && DstMO.isUndef() && Register::isPhysicalRegister(SrcReg)) { 3843 assert(SrcMO.getSubReg() == 0 && 3844 "Unexpected subreg on physical register"); 3845 const TargetRegisterClass *SpillRC; 3846 unsigned SpillSubreg; 3847 switch (DstMO.getSubReg()) { 3848 default: 3849 SpillRC = nullptr; 3850 break; 3851 case AArch64::sub_32: 3852 case AArch64::ssub: 3853 if (AArch64::GPR32RegClass.contains(SrcReg)) { 3854 SpillRC = &AArch64::GPR64RegClass; 3855 SpillSubreg = AArch64::sub_32; 3856 } else if (AArch64::FPR32RegClass.contains(SrcReg)) { 3857 SpillRC = &AArch64::FPR64RegClass; 3858 SpillSubreg = AArch64::ssub; 3859 } else 3860 SpillRC = nullptr; 3861 break; 3862 case AArch64::dsub: 3863 if (AArch64::FPR64RegClass.contains(SrcReg)) { 3864 SpillRC = &AArch64::FPR128RegClass; 3865 SpillSubreg = AArch64::dsub; 3866 } else 3867 SpillRC = nullptr; 3868 break; 3869 } 3870 3871 if (SpillRC) 3872 if (unsigned WidenedSrcReg = 3873 TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) { 3874 storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(), 3875 FrameIndex, SpillRC, &TRI); 3876 return &*--InsertPt; 3877 } 3878 } 3879 3880 // Handle cases like filling use of: 3881 // 3882 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1 3883 // 3884 // where we can load the full virtual reg source stack slot, into the subreg 3885 // destination, in this case producing: 3886 // 3887 // LDRWui %0:sub_32<def,read-undef>, %stack.0 3888 // 3889 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) { 3890 const TargetRegisterClass *FillRC; 3891 switch (DstMO.getSubReg()) { 3892 default: 3893 FillRC = nullptr; 3894 break; 3895 case 
AArch64::sub_32: 3896 FillRC = &AArch64::GPR32RegClass; 3897 break; 3898 case AArch64::ssub: 3899 FillRC = &AArch64::FPR32RegClass; 3900 break; 3901 case AArch64::dsub: 3902 FillRC = &AArch64::FPR64RegClass; 3903 break; 3904 } 3905 3906 if (FillRC) { 3907 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) == 3908 TRI.getRegSizeInBits(*FillRC) && 3909 "Mismatched regclass size on folded subreg COPY"); 3910 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI); 3911 MachineInstr &LoadMI = *--InsertPt; 3912 MachineOperand &LoadDst = LoadMI.getOperand(0); 3913 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load"); 3914 LoadDst.setSubReg(DstMO.getSubReg()); 3915 LoadDst.setIsUndef(); 3916 return &LoadMI; 3917 } 3918 } 3919 } 3920 3921 // Cannot fold. 3922 return nullptr; 3923 } 3924 3925 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, 3926 StackOffset &SOffset, 3927 bool *OutUseUnscaledOp, 3928 unsigned *OutUnscaledOp, 3929 int64_t *EmittableOffset) { 3930 // Set output values in case of early exit. 3931 if (EmittableOffset) 3932 *EmittableOffset = 0; 3933 if (OutUseUnscaledOp) 3934 *OutUseUnscaledOp = false; 3935 if (OutUnscaledOp) 3936 *OutUnscaledOp = 0; 3937 3938 // Exit early for structured vector spills/fills as they can't take an 3939 // immediate offset. 3940 switch (MI.getOpcode()) { 3941 default: 3942 break; 3943 case AArch64::LD1Twov2d: 3944 case AArch64::LD1Threev2d: 3945 case AArch64::LD1Fourv2d: 3946 case AArch64::LD1Twov1d: 3947 case AArch64::LD1Threev1d: 3948 case AArch64::LD1Fourv1d: 3949 case AArch64::ST1Twov2d: 3950 case AArch64::ST1Threev2d: 3951 case AArch64::ST1Fourv2d: 3952 case AArch64::ST1Twov1d: 3953 case AArch64::ST1Threev1d: 3954 case AArch64::ST1Fourv1d: 3955 case AArch64::IRG: 3956 case AArch64::IRGstack: 3957 case AArch64::STGloop: 3958 case AArch64::STZGloop: 3959 return AArch64FrameOffsetCannotUpdate; 3960 } 3961 3962 // Get the min/max offset and the scale. 3963 TypeSize ScaleValue(0U, false); 3964 unsigned Width; 3965 int64_t MinOff, MaxOff; 3966 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff, 3967 MaxOff)) 3968 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 3969 3970 // Construct the complete offset. 3971 bool IsMulVL = ScaleValue.isScalable(); 3972 unsigned Scale = ScaleValue.getKnownMinSize(); 3973 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed(); 3974 3975 const MachineOperand &ImmOpnd = 3976 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode())); 3977 Offset += ImmOpnd.getImm() * Scale; 3978 3979 // If the offset doesn't match the scale, we rewrite the instruction to 3980 // use the unscaled instruction instead. Likewise, if we have a negative 3981 // offset and there is an unscaled op to use. 
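  // For example (illustrative): LDRXui scales its immediate by 8, so a final
  // byte offset of 12 or -16 cannot be encoded directly; when an unscaled
  // counterpart exists (here LDURXi, which takes a signed 9-bit byte offset),
  // the access is rewritten to it and the raw byte offset is used instead.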
3982 Optional<unsigned> UnscaledOp = 3983 AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode()); 3984 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0); 3985 if (useUnscaledOp && 3986 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff, 3987 MaxOff)) 3988 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 3989 3990 Scale = ScaleValue.getKnownMinSize(); 3991 assert(IsMulVL == ScaleValue.isScalable() && 3992 "Unscaled opcode has different value for scalable"); 3993 3994 int64_t Remainder = Offset % Scale; 3995 assert(!(Remainder && useUnscaledOp) && 3996 "Cannot have remainder when using unscaled op"); 3997 3998 assert(MinOff < MaxOff && "Unexpected Min/Max offsets"); 3999 int64_t NewOffset = Offset / Scale; 4000 if (MinOff <= NewOffset && NewOffset <= MaxOff) 4001 Offset = Remainder; 4002 else { 4003 NewOffset = NewOffset < 0 ? MinOff : MaxOff; 4004 Offset = Offset - NewOffset * Scale + Remainder; 4005 } 4006 4007 if (EmittableOffset) 4008 *EmittableOffset = NewOffset; 4009 if (OutUseUnscaledOp) 4010 *OutUseUnscaledOp = useUnscaledOp; 4011 if (OutUnscaledOp && UnscaledOp) 4012 *OutUnscaledOp = *UnscaledOp; 4013 4014 if (IsMulVL) 4015 SOffset = StackOffset::get(SOffset.getFixed(), Offset); 4016 else 4017 SOffset = StackOffset::get(Offset, SOffset.getScalable()); 4018 return AArch64FrameOffsetCanUpdate | 4019 (SOffset ? 0 : AArch64FrameOffsetIsLegal); 4020 } 4021 4022 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, 4023 unsigned FrameReg, StackOffset &Offset, 4024 const AArch64InstrInfo *TII) { 4025 unsigned Opcode = MI.getOpcode(); 4026 unsigned ImmIdx = FrameRegIdx + 1; 4027 4028 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) { 4029 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm()); 4030 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), 4031 MI.getOperand(0).getReg(), FrameReg, Offset, TII, 4032 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri)); 4033 MI.eraseFromParent(); 4034 Offset = StackOffset(); 4035 return true; 4036 } 4037 4038 int64_t NewOffset; 4039 unsigned UnscaledOp; 4040 bool UseUnscaledOp; 4041 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, 4042 &UnscaledOp, &NewOffset); 4043 if (Status & AArch64FrameOffsetCanUpdate) { 4044 if (Status & AArch64FrameOffsetIsLegal) 4045 // Replace the FrameIndex with FrameReg. 4046 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); 4047 if (UseUnscaledOp) 4048 MI.setDesc(TII->get(UnscaledOp)); 4049 4050 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); 4051 return !Offset; 4052 } 4053 4054 return false; 4055 } 4056 4057 void AArch64InstrInfo::getNoop(MCInst &NopInst) const { 4058 NopInst.setOpcode(AArch64::HINT); 4059 NopInst.addOperand(MCOperand::createImm(0)); 4060 } 4061 4062 // AArch64 supports MachineCombiner. 4063 bool AArch64InstrInfo::useMachineCombiner() const { return true; } 4064 4065 // True when Opc sets flag 4066 static bool isCombineInstrSettingFlag(unsigned Opc) { 4067 switch (Opc) { 4068 case AArch64::ADDSWrr: 4069 case AArch64::ADDSWri: 4070 case AArch64::ADDSXrr: 4071 case AArch64::ADDSXri: 4072 case AArch64::SUBSWrr: 4073 case AArch64::SUBSXrr: 4074 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 
  case AArch64::SUBSWri:
  case AArch64::SUBSXri:
    return true;
  default:
    break;
  }
  return false;
}

// 32b Opcodes that can be combined with a MUL
static bool isCombineInstrCandidate32(unsigned Opc) {
  switch (Opc) {
  case AArch64::ADDWrr:
  case AArch64::ADDWri:
  case AArch64::SUBWrr:
  case AArch64::ADDSWrr:
  case AArch64::ADDSWri:
  case AArch64::SUBSWrr:
  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
  case AArch64::SUBWri:
  case AArch64::SUBSWri:
    return true;
  default:
    break;
  }
  return false;
}

// 64b Opcodes that can be combined with a MUL
static bool isCombineInstrCandidate64(unsigned Opc) {
  switch (Opc) {
  case AArch64::ADDXrr:
  case AArch64::ADDXri:
  case AArch64::SUBXrr:
  case AArch64::ADDSXrr:
  case AArch64::ADDSXri:
  case AArch64::SUBSXrr:
  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
  case AArch64::SUBXri:
  case AArch64::SUBSXri:
  case AArch64::ADDv8i8:
  case AArch64::ADDv16i8:
  case AArch64::ADDv4i16:
  case AArch64::ADDv8i16:
  case AArch64::ADDv2i32:
  case AArch64::ADDv4i32:
  case AArch64::SUBv8i8:
  case AArch64::SUBv16i8:
  case AArch64::SUBv4i16:
  case AArch64::SUBv8i16:
  case AArch64::SUBv2i32:
  case AArch64::SUBv4i32:
    return true;
  default:
    break;
  }
  return false;
}

// FP Opcodes that can be combined with an FMUL.
static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
  switch (Inst.getOpcode()) {
  default:
    break;
  case AArch64::FADDHrr:
  case AArch64::FADDSrr:
  case AArch64::FADDDrr:
  case AArch64::FADDv4f16:
  case AArch64::FADDv8f16:
  case AArch64::FADDv2f32:
  case AArch64::FADDv2f64:
  case AArch64::FADDv4f32:
  case AArch64::FSUBHrr:
  case AArch64::FSUBSrr:
  case AArch64::FSUBDrr:
  case AArch64::FSUBv4f16:
  case AArch64::FSUBv8f16:
  case AArch64::FSUBv2f32:
  case AArch64::FSUBv2f64:
  case AArch64::FSUBv4f32:
    TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
    // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
    // the target options or if FADD/FSUB has the contract fast-math flag.
    return Options.UnsafeFPMath ||
           Options.AllowFPOpFusion == FPOpFusion::Fast ||
           Inst.getFlag(MachineInstr::FmContract);
  }
  return false;
}

// Opcodes that can be combined with a MUL
static bool isCombineInstrCandidate(unsigned Opc) {
  return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
}

//
// Utility routine that checks if \param MO is defined by an
// \param CombineOpc instruction in the basic block \param MBB
static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
                       unsigned CombineOpc, unsigned ZeroReg = 0,
                       bool CheckZeroReg = false) {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineInstr *MI = nullptr;

  if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
    MI = MRI.getUniqueVRegDef(MO.getReg());
  // And it needs to be in the trace (otherwise, it won't have a depth).
  if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
    return false;
  // Must only be used by the user we combine with.
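  // (If the multiply had other users it would have to stay live after the
  // combine, so replacing the add/sub would not actually remove any work.)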
  if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
    return false;

  if (CheckZeroReg) {
    assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
           MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
           MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
    // The third input reg must be zero.
    if (MI->getOperand(3).getReg() != ZeroReg)
      return false;
  }

  return true;
}

//
// Is \param MO defined by an integer multiply and can be combined?
static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
                              unsigned MulOpc, unsigned ZeroReg) {
  return canCombine(MBB, MO, MulOpc, ZeroReg, true);
}

//
// Is \param MO defined by a floating-point multiply and can be combined?
static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
                               unsigned MulOpc) {
  return canCombine(MBB, MO, MulOpc);
}

// TODO: There are many more machine instruction opcodes to match:
//       1. Other data types (integer, vectors)
//       2. Other math / logic operations (xor, or)
//       3. Other forms of the same operation (intrinsics and other variants)
bool AArch64InstrInfo::isAssociativeAndCommutative(
    const MachineInstr &Inst) const {
  switch (Inst.getOpcode()) {
  case AArch64::FADDDrr:
  case AArch64::FADDSrr:
  case AArch64::FADDv2f32:
  case AArch64::FADDv2f64:
  case AArch64::FADDv4f32:
  case AArch64::FMULDrr:
  case AArch64::FMULSrr:
  case AArch64::FMULX32:
  case AArch64::FMULX64:
  case AArch64::FMULXv2f32:
  case AArch64::FMULXv2f64:
  case AArch64::FMULXv4f32:
  case AArch64::FMULv2f32:
  case AArch64::FMULv2f64:
  case AArch64::FMULv4f32:
    return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
  default:
    return false;
  }
}

/// Find instructions that can be turned into madd.
static bool getMaddPatterns(MachineInstr &Root,
                            SmallVectorImpl<MachineCombinerPattern> &Patterns) {
  unsigned Opc = Root.getOpcode();
  MachineBasicBlock &MBB = *Root.getParent();
  bool Found = false;

  if (!isCombineInstrCandidate(Opc))
    return false;
  if (isCombineInstrSettingFlag(Opc)) {
    int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
    // When NZCV is live, bail out.
    if (Cmp_NZCV == -1)
      return false;
    unsigned NewOpc = convertToNonFlagSettingOpc(Root);
    // When the opcode can't change, bail out.
    // CHECKME: do we miss any cases for opcode conversion?
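    // For example, ADDSWrr whose NZCV def is dead is treated as ADDWrr here,
    // so the MUL + ADDS pair can still be turned into a MADD.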
4260 if (NewOpc == Opc) 4261 return false; 4262 Opc = NewOpc; 4263 } 4264 4265 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg, 4266 MachineCombinerPattern Pattern) { 4267 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) { 4268 Patterns.push_back(Pattern); 4269 Found = true; 4270 } 4271 }; 4272 4273 auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) { 4274 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) { 4275 Patterns.push_back(Pattern); 4276 Found = true; 4277 } 4278 }; 4279 4280 typedef MachineCombinerPattern MCP; 4281 4282 switch (Opc) { 4283 default: 4284 break; 4285 case AArch64::ADDWrr: 4286 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 4287 "ADDWrr does not have register operands"); 4288 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1); 4289 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2); 4290 break; 4291 case AArch64::ADDXrr: 4292 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1); 4293 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2); 4294 break; 4295 case AArch64::SUBWrr: 4296 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1); 4297 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2); 4298 break; 4299 case AArch64::SUBXrr: 4300 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1); 4301 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2); 4302 break; 4303 case AArch64::ADDWri: 4304 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1); 4305 break; 4306 case AArch64::ADDXri: 4307 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1); 4308 break; 4309 case AArch64::SUBWri: 4310 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1); 4311 break; 4312 case AArch64::SUBXri: 4313 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1); 4314 break; 4315 case AArch64::ADDv8i8: 4316 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1); 4317 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2); 4318 break; 4319 case AArch64::ADDv16i8: 4320 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1); 4321 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2); 4322 break; 4323 case AArch64::ADDv4i16: 4324 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1); 4325 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2); 4326 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1); 4327 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2); 4328 break; 4329 case AArch64::ADDv8i16: 4330 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1); 4331 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2); 4332 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1); 4333 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2); 4334 break; 4335 case AArch64::ADDv2i32: 4336 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1); 4337 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2); 4338 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1); 4339 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2); 4340 break; 4341 case AArch64::ADDv4i32: 4342 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1); 4343 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2); 4344 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1); 4345 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2); 4346 break; 4347 case AArch64::SUBv8i8: 4348 setVFound(AArch64::MULv8i8, 1, 
MCP::MULSUBv8i8_OP1); 4349 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2); 4350 break; 4351 case AArch64::SUBv16i8: 4352 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1); 4353 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2); 4354 break; 4355 case AArch64::SUBv4i16: 4356 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1); 4357 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2); 4358 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1); 4359 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2); 4360 break; 4361 case AArch64::SUBv8i16: 4362 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1); 4363 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2); 4364 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1); 4365 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2); 4366 break; 4367 case AArch64::SUBv2i32: 4368 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1); 4369 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2); 4370 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1); 4371 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2); 4372 break; 4373 case AArch64::SUBv4i32: 4374 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1); 4375 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2); 4376 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1); 4377 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2); 4378 break; 4379 } 4380 return Found; 4381 } 4382 /// Floating-Point Support 4383 4384 /// Find instructions that can be turned into madd. 4385 static bool getFMAPatterns(MachineInstr &Root, 4386 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 4387 4388 if (!isCombineInstrCandidateFP(Root)) 4389 return false; 4390 4391 MachineBasicBlock &MBB = *Root.getParent(); 4392 bool Found = false; 4393 4394 auto Match = [&](int Opcode, int Operand, 4395 MachineCombinerPattern Pattern) -> bool { 4396 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) { 4397 Patterns.push_back(Pattern); 4398 return true; 4399 } 4400 return false; 4401 }; 4402 4403 typedef MachineCombinerPattern MCP; 4404 4405 switch (Root.getOpcode()) { 4406 default: 4407 assert(false && "Unsupported FP instruction in combiner\n"); 4408 break; 4409 case AArch64::FADDHrr: 4410 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 4411 "FADDHrr does not have register operands"); 4412 4413 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1); 4414 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2); 4415 break; 4416 case AArch64::FADDSrr: 4417 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 4418 "FADDSrr does not have register operands"); 4419 4420 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) || 4421 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1); 4422 4423 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) || 4424 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2); 4425 break; 4426 case AArch64::FADDDrr: 4427 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) || 4428 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1); 4429 4430 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) || 4431 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2); 4432 break; 4433 case AArch64::FADDv4f16: 4434 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) || 4435 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1); 4436 4437 Found |= 
Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) || 4438 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2); 4439 break; 4440 case AArch64::FADDv8f16: 4441 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) || 4442 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1); 4443 4444 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) || 4445 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2); 4446 break; 4447 case AArch64::FADDv2f32: 4448 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) || 4449 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1); 4450 4451 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) || 4452 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2); 4453 break; 4454 case AArch64::FADDv2f64: 4455 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) || 4456 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1); 4457 4458 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) || 4459 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2); 4460 break; 4461 case AArch64::FADDv4f32: 4462 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) || 4463 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1); 4464 4465 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) || 4466 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2); 4467 break; 4468 case AArch64::FSUBHrr: 4469 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1); 4470 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2); 4471 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1); 4472 break; 4473 case AArch64::FSUBSrr: 4474 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1); 4475 4476 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) || 4477 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2); 4478 4479 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1); 4480 break; 4481 case AArch64::FSUBDrr: 4482 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1); 4483 4484 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) || 4485 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2); 4486 4487 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1); 4488 break; 4489 case AArch64::FSUBv4f16: 4490 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) || 4491 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2); 4492 4493 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) || 4494 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1); 4495 break; 4496 case AArch64::FSUBv8f16: 4497 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) || 4498 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2); 4499 4500 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) || 4501 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1); 4502 break; 4503 case AArch64::FSUBv2f32: 4504 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) || 4505 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2); 4506 4507 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) || 4508 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1); 4509 break; 4510 case AArch64::FSUBv2f64: 4511 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) || 4512 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2); 4513 4514 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) || 4515 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1); 4516 break; 4517 case 
AArch64::FSUBv4f32: 4518 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) || 4519 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2); 4520 4521 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) || 4522 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1); 4523 break; 4524 } 4525 return Found; 4526 } 4527 4528 /// Return true when a code sequence can improve throughput. It 4529 /// should be called only for instructions in loops. 4530 /// \param Pattern - combiner pattern 4531 bool AArch64InstrInfo::isThroughputPattern( 4532 MachineCombinerPattern Pattern) const { 4533 switch (Pattern) { 4534 default: 4535 break; 4536 case MachineCombinerPattern::FMULADDH_OP1: 4537 case MachineCombinerPattern::FMULADDH_OP2: 4538 case MachineCombinerPattern::FMULSUBH_OP1: 4539 case MachineCombinerPattern::FMULSUBH_OP2: 4540 case MachineCombinerPattern::FMULADDS_OP1: 4541 case MachineCombinerPattern::FMULADDS_OP2: 4542 case MachineCombinerPattern::FMULSUBS_OP1: 4543 case MachineCombinerPattern::FMULSUBS_OP2: 4544 case MachineCombinerPattern::FMULADDD_OP1: 4545 case MachineCombinerPattern::FMULADDD_OP2: 4546 case MachineCombinerPattern::FMULSUBD_OP1: 4547 case MachineCombinerPattern::FMULSUBD_OP2: 4548 case MachineCombinerPattern::FNMULSUBH_OP1: 4549 case MachineCombinerPattern::FNMULSUBS_OP1: 4550 case MachineCombinerPattern::FNMULSUBD_OP1: 4551 case MachineCombinerPattern::FMLAv4i16_indexed_OP1: 4552 case MachineCombinerPattern::FMLAv4i16_indexed_OP2: 4553 case MachineCombinerPattern::FMLAv8i16_indexed_OP1: 4554 case MachineCombinerPattern::FMLAv8i16_indexed_OP2: 4555 case MachineCombinerPattern::FMLAv1i32_indexed_OP1: 4556 case MachineCombinerPattern::FMLAv1i32_indexed_OP2: 4557 case MachineCombinerPattern::FMLAv1i64_indexed_OP1: 4558 case MachineCombinerPattern::FMLAv1i64_indexed_OP2: 4559 case MachineCombinerPattern::FMLAv4f16_OP2: 4560 case MachineCombinerPattern::FMLAv4f16_OP1: 4561 case MachineCombinerPattern::FMLAv8f16_OP1: 4562 case MachineCombinerPattern::FMLAv8f16_OP2: 4563 case MachineCombinerPattern::FMLAv2f32_OP2: 4564 case MachineCombinerPattern::FMLAv2f32_OP1: 4565 case MachineCombinerPattern::FMLAv2f64_OP1: 4566 case MachineCombinerPattern::FMLAv2f64_OP2: 4567 case MachineCombinerPattern::FMLAv2i32_indexed_OP1: 4568 case MachineCombinerPattern::FMLAv2i32_indexed_OP2: 4569 case MachineCombinerPattern::FMLAv2i64_indexed_OP1: 4570 case MachineCombinerPattern::FMLAv2i64_indexed_OP2: 4571 case MachineCombinerPattern::FMLAv4f32_OP1: 4572 case MachineCombinerPattern::FMLAv4f32_OP2: 4573 case MachineCombinerPattern::FMLAv4i32_indexed_OP1: 4574 case MachineCombinerPattern::FMLAv4i32_indexed_OP2: 4575 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: 4576 case MachineCombinerPattern::FMLSv4i16_indexed_OP2: 4577 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: 4578 case MachineCombinerPattern::FMLSv8i16_indexed_OP2: 4579 case MachineCombinerPattern::FMLSv1i32_indexed_OP2: 4580 case MachineCombinerPattern::FMLSv1i64_indexed_OP2: 4581 case MachineCombinerPattern::FMLSv2i32_indexed_OP2: 4582 case MachineCombinerPattern::FMLSv2i64_indexed_OP2: 4583 case MachineCombinerPattern::FMLSv4f16_OP1: 4584 case MachineCombinerPattern::FMLSv4f16_OP2: 4585 case MachineCombinerPattern::FMLSv8f16_OP1: 4586 case MachineCombinerPattern::FMLSv8f16_OP2: 4587 case MachineCombinerPattern::FMLSv2f32_OP2: 4588 case MachineCombinerPattern::FMLSv2f64_OP2: 4589 case MachineCombinerPattern::FMLSv4i32_indexed_OP2: 4590 case MachineCombinerPattern::FMLSv4f32_OP2: 4591 case 
MachineCombinerPattern::MULADDv8i8_OP1: 4592 case MachineCombinerPattern::MULADDv8i8_OP2: 4593 case MachineCombinerPattern::MULADDv16i8_OP1: 4594 case MachineCombinerPattern::MULADDv16i8_OP2: 4595 case MachineCombinerPattern::MULADDv4i16_OP1: 4596 case MachineCombinerPattern::MULADDv4i16_OP2: 4597 case MachineCombinerPattern::MULADDv8i16_OP1: 4598 case MachineCombinerPattern::MULADDv8i16_OP2: 4599 case MachineCombinerPattern::MULADDv2i32_OP1: 4600 case MachineCombinerPattern::MULADDv2i32_OP2: 4601 case MachineCombinerPattern::MULADDv4i32_OP1: 4602 case MachineCombinerPattern::MULADDv4i32_OP2: 4603 case MachineCombinerPattern::MULSUBv8i8_OP1: 4604 case MachineCombinerPattern::MULSUBv8i8_OP2: 4605 case MachineCombinerPattern::MULSUBv16i8_OP1: 4606 case MachineCombinerPattern::MULSUBv16i8_OP2: 4607 case MachineCombinerPattern::MULSUBv4i16_OP1: 4608 case MachineCombinerPattern::MULSUBv4i16_OP2: 4609 case MachineCombinerPattern::MULSUBv8i16_OP1: 4610 case MachineCombinerPattern::MULSUBv8i16_OP2: 4611 case MachineCombinerPattern::MULSUBv2i32_OP1: 4612 case MachineCombinerPattern::MULSUBv2i32_OP2: 4613 case MachineCombinerPattern::MULSUBv4i32_OP1: 4614 case MachineCombinerPattern::MULSUBv4i32_OP2: 4615 case MachineCombinerPattern::MULADDv4i16_indexed_OP1: 4616 case MachineCombinerPattern::MULADDv4i16_indexed_OP2: 4617 case MachineCombinerPattern::MULADDv8i16_indexed_OP1: 4618 case MachineCombinerPattern::MULADDv8i16_indexed_OP2: 4619 case MachineCombinerPattern::MULADDv2i32_indexed_OP1: 4620 case MachineCombinerPattern::MULADDv2i32_indexed_OP2: 4621 case MachineCombinerPattern::MULADDv4i32_indexed_OP1: 4622 case MachineCombinerPattern::MULADDv4i32_indexed_OP2: 4623 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 4624 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 4625 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 4626 case MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 4627 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 4628 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 4629 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 4630 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 4631 return true; 4632 } // end switch (Pattern) 4633 return false; 4634 } 4635 /// Return true when there is potentially a faster code sequence for an 4636 /// instruction chain ending in \p Root. All potential patterns are listed in 4637 /// the \p Pattern vector. Pattern should be sorted in priority order since the 4638 /// pattern evaluator stops checking as soon as it finds a faster sequence. 4639 4640 bool AArch64InstrInfo::getMachineCombinerPatterns( 4641 MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns, 4642 bool DoRegPressureReduce) const { 4643 // Integer patterns 4644 if (getMaddPatterns(Root, Patterns)) 4645 return true; 4646 // Floating point patterns 4647 if (getFMAPatterns(Root, Patterns)) 4648 return true; 4649 4650 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, 4651 DoRegPressureReduce); 4652 } 4653 4654 enum class FMAInstKind { Default, Indexed, Accumulator }; 4655 /// genFusedMultiply - Generate fused multiply instructions. 4656 /// This function supports both integer and floating point instructions. 
/// A typical example:
///  F|MUL I=A,B,0
///  F|ADD R,I,C
///  ==> F|MADD R,A,B,C
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the F|ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the F|MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the f|madd instruction
/// \param RC Register class of operands
/// \param kind The kind of fma instruction (addressing mode) to be generated
/// \param ReplacedAddend is the result register from the instruction
/// replacing the non-combined operand, if any.
static MachineInstr *
genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
                 const TargetInstrInfo *TII, MachineInstr &Root,
                 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
                 unsigned MaddOpc, const TargetRegisterClass *RC,
                 FMAInstKind kind = FMAInstKind::Default,
                 const Register *ReplacedAddend = nullptr) {
  assert(IdxMulOpd == 1 || IdxMulOpd == 2);

  unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
  Register ResultReg = Root.getOperand(0).getReg();
  Register SrcReg0 = MUL->getOperand(1).getReg();
  bool Src0IsKill = MUL->getOperand(1).isKill();
  Register SrcReg1 = MUL->getOperand(2).getReg();
  bool Src1IsKill = MUL->getOperand(2).isKill();

  unsigned SrcReg2;
  bool Src2IsKill;
  if (ReplacedAddend) {
    // If we just generated a new addend, we must be its only use.
    SrcReg2 = *ReplacedAddend;
    Src2IsKill = true;
  } else {
    SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
    Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
  }

  if (Register::isVirtualRegister(ResultReg))
    MRI.constrainRegClass(ResultReg, RC);
  if (Register::isVirtualRegister(SrcReg0))
    MRI.constrainRegClass(SrcReg0, RC);
  if (Register::isVirtualRegister(SrcReg1))
    MRI.constrainRegClass(SrcReg1, RC);
  if (Register::isVirtualRegister(SrcReg2))
    MRI.constrainRegClass(SrcReg2, RC);

  MachineInstrBuilder MIB;
  if (kind == FMAInstKind::Default)
    MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill))
              .addReg(SrcReg2, getKillRegState(Src2IsKill));
  else if (kind == FMAInstKind::Indexed)
    MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg2, getKillRegState(Src2IsKill))
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill))
              .addImm(MUL->getOperand(3).getImm());
  else if (kind == FMAInstKind::Accumulator)
    MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg2, getKillRegState(Src2IsKill))
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill));
  else
    assert(false && "Invalid FMA instruction kind \n");
  // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
  InsInstrs.push_back(MIB);
  return MUL;
}

/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
/// instructions.
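/// For example (illustrative), with the accumulator form a vector pattern like
///   MUL v0.4s, v1.4s, v2.4s
///   ADD v3.4s, v3.4s, v0.4s
/// becomes
///   MLA v3.4s, v1.4s, v2.4s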
4737 /// 4738 /// \see genFusedMultiply 4739 static MachineInstr *genFusedMultiplyAcc( 4740 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 4741 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 4742 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) { 4743 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 4744 FMAInstKind::Accumulator); 4745 } 4746 4747 /// genNeg - Helper to generate an intermediate negation of the second operand 4748 /// of Root 4749 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI, 4750 const TargetInstrInfo *TII, MachineInstr &Root, 4751 SmallVectorImpl<MachineInstr *> &InsInstrs, 4752 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, 4753 unsigned MnegOpc, const TargetRegisterClass *RC) { 4754 Register NewVR = MRI.createVirtualRegister(RC); 4755 MachineInstrBuilder MIB = 4756 BuildMI(MF, Root.getDebugLoc(), TII->get(MnegOpc), NewVR) 4757 .add(Root.getOperand(2)); 4758 InsInstrs.push_back(MIB); 4759 4760 assert(InstrIdxForVirtReg.empty()); 4761 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 4762 4763 return NewVR; 4764 } 4765 4766 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate 4767 /// instructions with an additional negation of the accumulator 4768 static MachineInstr *genFusedMultiplyAccNeg( 4769 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 4770 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 4771 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd, 4772 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) { 4773 assert(IdxMulOpd == 1); 4774 4775 Register NewVR = 4776 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC); 4777 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 4778 FMAInstKind::Accumulator, &NewVR); 4779 } 4780 4781 /// genFusedMultiplyIdx - Helper to generate fused multiply accumulate 4782 /// instructions. 
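/// For example (illustrative), with an indexed (by-element) multiplicand a
/// pattern like
///   MUL v0.4s, v1.4s, v2.s[1]
///   ADD v3.4s, v3.4s, v0.4s
/// becomes
///   MLA v3.4s, v1.4s, v2.s[1]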
///
/// \see genFusedMultiply
static MachineInstr *genFusedMultiplyIdx(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Indexed);
}

/// genFusedMultiplyIdxNeg - Helper to generate fused multiply accumulate
/// instructions with an indexed multiplicand and an additional negation of
/// the accumulator.
static MachineInstr *genFusedMultiplyIdxNeg(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
    unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
  assert(IdxMulOpd == 1);

  Register NewVR =
      genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);

  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Indexed, &NewVR);
}

/// genMaddR - Generate madd instruction and combine mul and add using
/// an extra virtual register.
/// Example - an ADD intermediate needs to be stored in a register:
///   MUL I=A,B,0
///   ADD R,I,Imm
///   ==> ORR  V, ZR, Imm
///   ==> MADD R,A,B,V
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the madd instruction
/// \param VR is a virtual register that holds the value of an ADD operand
/// (V in the example above).
4827 /// \param RC Register class of operands 4828 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, 4829 const TargetInstrInfo *TII, MachineInstr &Root, 4830 SmallVectorImpl<MachineInstr *> &InsInstrs, 4831 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, 4832 const TargetRegisterClass *RC) { 4833 assert(IdxMulOpd == 1 || IdxMulOpd == 2); 4834 4835 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); 4836 Register ResultReg = Root.getOperand(0).getReg(); 4837 Register SrcReg0 = MUL->getOperand(1).getReg(); 4838 bool Src0IsKill = MUL->getOperand(1).isKill(); 4839 Register SrcReg1 = MUL->getOperand(2).getReg(); 4840 bool Src1IsKill = MUL->getOperand(2).isKill(); 4841 4842 if (Register::isVirtualRegister(ResultReg)) 4843 MRI.constrainRegClass(ResultReg, RC); 4844 if (Register::isVirtualRegister(SrcReg0)) 4845 MRI.constrainRegClass(SrcReg0, RC); 4846 if (Register::isVirtualRegister(SrcReg1)) 4847 MRI.constrainRegClass(SrcReg1, RC); 4848 if (Register::isVirtualRegister(VR)) 4849 MRI.constrainRegClass(VR, RC); 4850 4851 MachineInstrBuilder MIB = 4852 BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) 4853 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 4854 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 4855 .addReg(VR); 4856 // Insert the MADD 4857 InsInstrs.push_back(MIB); 4858 return MUL; 4859 } 4860 4861 /// When getMachineCombinerPatterns() finds potential patterns, 4862 /// this function generates the instructions that could replace the 4863 /// original code sequence 4864 void AArch64InstrInfo::genAlternativeCodeSequence( 4865 MachineInstr &Root, MachineCombinerPattern Pattern, 4866 SmallVectorImpl<MachineInstr *> &InsInstrs, 4867 SmallVectorImpl<MachineInstr *> &DelInstrs, 4868 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const { 4869 MachineBasicBlock &MBB = *Root.getParent(); 4870 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 4871 MachineFunction &MF = *MBB.getParent(); 4872 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); 4873 4874 MachineInstr *MUL = nullptr; 4875 const TargetRegisterClass *RC; 4876 unsigned Opc; 4877 switch (Pattern) { 4878 default: 4879 // Reassociate instructions. 
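    // For example (illustrative), a chain like ((A + B) + C) + D can be
    // rebalanced to (A + B) + (C + D) by the generic combiner to shorten the
    // critical path; none of the AArch64-specific patterns below apply here.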
4880 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, 4881 DelInstrs, InstrIdxForVirtReg); 4882 return; 4883 case MachineCombinerPattern::MULADDW_OP1: 4884 case MachineCombinerPattern::MULADDX_OP1: 4885 // MUL I=A,B,0 4886 // ADD R,I,C 4887 // ==> MADD R,A,B,C 4888 // --- Create(MADD); 4889 if (Pattern == MachineCombinerPattern::MULADDW_OP1) { 4890 Opc = AArch64::MADDWrrr; 4891 RC = &AArch64::GPR32RegClass; 4892 } else { 4893 Opc = AArch64::MADDXrrr; 4894 RC = &AArch64::GPR64RegClass; 4895 } 4896 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4897 break; 4898 case MachineCombinerPattern::MULADDW_OP2: 4899 case MachineCombinerPattern::MULADDX_OP2: 4900 // MUL I=A,B,0 4901 // ADD R,C,I 4902 // ==> MADD R,A,B,C 4903 // --- Create(MADD); 4904 if (Pattern == MachineCombinerPattern::MULADDW_OP2) { 4905 Opc = AArch64::MADDWrrr; 4906 RC = &AArch64::GPR32RegClass; 4907 } else { 4908 Opc = AArch64::MADDXrrr; 4909 RC = &AArch64::GPR64RegClass; 4910 } 4911 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4912 break; 4913 case MachineCombinerPattern::MULADDWI_OP1: 4914 case MachineCombinerPattern::MULADDXI_OP1: { 4915 // MUL I=A,B,0 4916 // ADD R,I,Imm 4917 // ==> ORR V, ZR, Imm 4918 // ==> MADD R,A,B,V 4919 // --- Create(MADD); 4920 const TargetRegisterClass *OrrRC; 4921 unsigned BitSize, OrrOpc, ZeroReg; 4922 if (Pattern == MachineCombinerPattern::MULADDWI_OP1) { 4923 OrrOpc = AArch64::ORRWri; 4924 OrrRC = &AArch64::GPR32spRegClass; 4925 BitSize = 32; 4926 ZeroReg = AArch64::WZR; 4927 Opc = AArch64::MADDWrrr; 4928 RC = &AArch64::GPR32RegClass; 4929 } else { 4930 OrrOpc = AArch64::ORRXri; 4931 OrrRC = &AArch64::GPR64spRegClass; 4932 BitSize = 64; 4933 ZeroReg = AArch64::XZR; 4934 Opc = AArch64::MADDXrrr; 4935 RC = &AArch64::GPR64RegClass; 4936 } 4937 Register NewVR = MRI.createVirtualRegister(OrrRC); 4938 uint64_t Imm = Root.getOperand(2).getImm(); 4939 4940 if (Root.getOperand(3).isImm()) { 4941 unsigned Val = Root.getOperand(3).getImm(); 4942 Imm = Imm << Val; 4943 } 4944 uint64_t UImm = SignExtend64(Imm, BitSize); 4945 uint64_t Encoding; 4946 if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { 4947 MachineInstrBuilder MIB1 = 4948 BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR) 4949 .addReg(ZeroReg) 4950 .addImm(Encoding); 4951 InsInstrs.push_back(MIB1); 4952 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 4953 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 4954 } 4955 break; 4956 } 4957 case MachineCombinerPattern::MULSUBW_OP1: 4958 case MachineCombinerPattern::MULSUBX_OP1: { 4959 // MUL I=A,B,0 4960 // SUB R,I, C 4961 // ==> SUB V, 0, C 4962 // ==> MADD R,A,B,V // = -C + A*B 4963 // --- Create(MADD); 4964 const TargetRegisterClass *SubRC; 4965 unsigned SubOpc, ZeroReg; 4966 if (Pattern == MachineCombinerPattern::MULSUBW_OP1) { 4967 SubOpc = AArch64::SUBWrr; 4968 SubRC = &AArch64::GPR32spRegClass; 4969 ZeroReg = AArch64::WZR; 4970 Opc = AArch64::MADDWrrr; 4971 RC = &AArch64::GPR32RegClass; 4972 } else { 4973 SubOpc = AArch64::SUBXrr; 4974 SubRC = &AArch64::GPR64spRegClass; 4975 ZeroReg = AArch64::XZR; 4976 Opc = AArch64::MADDXrrr; 4977 RC = &AArch64::GPR64RegClass; 4978 } 4979 Register NewVR = MRI.createVirtualRegister(SubRC); 4980 // SUB NewVR, 0, C 4981 MachineInstrBuilder MIB1 = 4982 BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR) 4983 .addReg(ZeroReg) 4984 .add(Root.getOperand(2)); 4985 InsInstrs.push_back(MIB1); 4986 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 4987 MUL 
  case MachineCombinerPattern::MULADDWI_OP1:
  case MachineCombinerPattern::MULADDXI_OP1: {
    // MUL I=A,B,0
    // ADD R,I,Imm
    // ==> ORR  V, ZR, Imm
    // ==> MADD R,A,B,V
    // --- Create(MADD);
    const TargetRegisterClass *OrrRC;
    unsigned BitSize, OrrOpc, ZeroReg;
    if (Pattern == MachineCombinerPattern::MULADDWI_OP1) {
      OrrOpc = AArch64::ORRWri;
      OrrRC = &AArch64::GPR32spRegClass;
      BitSize = 32;
      ZeroReg = AArch64::WZR;
      Opc = AArch64::MADDWrrr;
      RC = &AArch64::GPR32RegClass;
    } else {
      OrrOpc = AArch64::ORRXri;
      OrrRC = &AArch64::GPR64spRegClass;
      BitSize = 64;
      ZeroReg = AArch64::XZR;
      Opc = AArch64::MADDXrrr;
      RC = &AArch64::GPR64RegClass;
    }
    Register NewVR = MRI.createVirtualRegister(OrrRC);
    uint64_t Imm = Root.getOperand(2).getImm();

    if (Root.getOperand(3).isImm()) {
      unsigned Val = Root.getOperand(3).getImm();
      Imm = Imm << Val;
    }
    uint64_t UImm = SignExtend64(Imm, BitSize);
    uint64_t Encoding;
    if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
      MachineInstrBuilder MIB1 =
          BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
              .addReg(ZeroReg)
              .addImm(Encoding);
      InsInstrs.push_back(MIB1);
      InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
      MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
    }
    break;
  }
  case MachineCombinerPattern::MULSUBW_OP1:
  case MachineCombinerPattern::MULSUBX_OP1: {
    // MUL I=A,B,0
    // SUB R,I, C
    // ==> SUB  V, 0, C
    // ==> MADD R,A,B,V // = -C + A*B
    // --- Create(MADD);
    const TargetRegisterClass *SubRC;
    unsigned SubOpc, ZeroReg;
    if (Pattern == MachineCombinerPattern::MULSUBW_OP1) {
      SubOpc = AArch64::SUBWrr;
      SubRC = &AArch64::GPR32spRegClass;
      ZeroReg = AArch64::WZR;
      Opc = AArch64::MADDWrrr;
      RC = &AArch64::GPR32RegClass;
    } else {
      SubOpc = AArch64::SUBXrr;
      SubRC = &AArch64::GPR64spRegClass;
      ZeroReg = AArch64::XZR;
      Opc = AArch64::MADDXrrr;
      RC = &AArch64::GPR64RegClass;
    }
    Register NewVR = MRI.createVirtualRegister(SubRC);
    // SUB NewVR, 0, C
    MachineInstrBuilder MIB1 =
        BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
            .addReg(ZeroReg)
            .add(Root.getOperand(2));
    InsInstrs.push_back(MIB1);
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
    break;
  }
  case MachineCombinerPattern::MULSUBW_OP2:
  case MachineCombinerPattern::MULSUBX_OP2:
    // MUL I=A,B,0
    // SUB R,C,I
    // ==> MSUB R,A,B,C (computes C - A*B)
    // --- Create(MSUB);
    if (Pattern == MachineCombinerPattern::MULSUBW_OP2) {
      Opc = AArch64::MSUBWrrr;
      RC = &AArch64::GPR32RegClass;
    } else {
      Opc = AArch64::MSUBXrrr;
      RC = &AArch64::GPR64RegClass;
    }
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBWI_OP1:
  case MachineCombinerPattern::MULSUBXI_OP1: {
    // MUL I=A,B,0
    // SUB R,I, Imm
    // ==> ORR  V, ZR, -Imm
    // ==> MADD R,A,B,V // = -Imm + A*B
    // --- Create(MADD);
    const TargetRegisterClass *OrrRC;
    unsigned BitSize, OrrOpc, ZeroReg;
    if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) {
      OrrOpc = AArch64::ORRWri;
      OrrRC = &AArch64::GPR32spRegClass;
      BitSize = 32;
      ZeroReg = AArch64::WZR;
      Opc = AArch64::MADDWrrr;
      RC = &AArch64::GPR32RegClass;
    } else {
      OrrOpc = AArch64::ORRXri;
      OrrRC = &AArch64::GPR64spRegClass;
      BitSize = 64;
      ZeroReg = AArch64::XZR;
      Opc = AArch64::MADDXrrr;
      RC = &AArch64::GPR64RegClass;
    }
    Register NewVR = MRI.createVirtualRegister(OrrRC);
    uint64_t Imm = Root.getOperand(2).getImm();
    if (Root.getOperand(3).isImm()) {
      unsigned Val = Root.getOperand(3).getImm();
      Imm = Imm << Val;
    }
    uint64_t UImm = SignExtend64(-Imm, BitSize);
    uint64_t Encoding;
    if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
      MachineInstrBuilder MIB1 =
          BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
              .addReg(ZeroReg)
              .addImm(Encoding);
      InsInstrs.push_back(MIB1);
      InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
      MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
    }
    break;
  }

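  // The vector integer patterns below use MLA (element-wise Rd += Rn * Rm)
  // and MLS (Rd -= Rn * Rm). For the *_OP1 subtract patterns the multiply is
  // the minuend, so no single accumulating instruction matches; those cases
  // first negate the subtracted operand and then accumulate with MLA (see
  // genFusedMultiplyAccNeg).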
  case MachineCombinerPattern::MULADDv8i8_OP1:
    Opc = AArch64::MLAv8i8;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv8i8_OP2:
    Opc = AArch64::MLAv8i8;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv16i8_OP1:
    Opc = AArch64::MLAv16i8;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv16i8_OP2:
    Opc = AArch64::MLAv16i8;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv4i16_OP1:
    Opc = AArch64::MLAv4i16;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv4i16_OP2:
    Opc = AArch64::MLAv4i16;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv8i16_OP1:
    Opc = AArch64::MLAv8i16;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv8i16_OP2:
    Opc = AArch64::MLAv8i16;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv2i32_OP1:
    Opc = AArch64::MLAv2i32;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv2i32_OP2:
    Opc = AArch64::MLAv2i32;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv4i32_OP1:
    Opc = AArch64::MLAv4i32;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv4i32_OP2:
    Opc = AArch64::MLAv4i32;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;

  case MachineCombinerPattern::MULSUBv8i8_OP1:
    Opc = AArch64::MLAv8i8;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
                                 RC);
    break;
  case MachineCombinerPattern::MULSUBv8i8_OP2:
    Opc = AArch64::MLSv8i8;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv16i8_OP1:
    Opc = AArch64::MLAv16i8;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
                                 RC);
    break;
  case MachineCombinerPattern::MULSUBv16i8_OP2:
    Opc = AArch64::MLSv16i8;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv4i16_OP1:
    Opc = AArch64::MLAv4i16;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
                                 RC);
    break;
  case MachineCombinerPattern::MULSUBv4i16_OP2:
    Opc = AArch64::MLSv4i16;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv8i16_OP1:
    Opc = AArch64::MLAv8i16;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
                                 RC);
    break;
  case MachineCombinerPattern::MULSUBv8i16_OP2:
    Opc = AArch64::MLSv8i16;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv2i32_OP1:
    Opc = AArch64::MLAv2i32;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
                                 RC);
    break;
  case MachineCombinerPattern::MULSUBv2i32_OP2:
    Opc = AArch64::MLSv2i32;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv4i32_OP1:
    Opc = AArch64::MLAv4i32;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
                                 RC);
    break;
  case MachineCombinerPattern::MULSUBv4i32_OP2:
    Opc = AArch64::MLSv4i32;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;

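  // The _indexed patterns below use the by-element forms of MLA/MLS, which
  // multiply every element of one vector by a single lane of the other, e.g.
  // (illustrative):
  //   mla v0.4s, v1.4s, v2.s[1]
  // The operand handling mirrors the non-indexed cases; *_OP1 subtracts again
  // go through a NEG followed by MLA (genFusedMultiplyIdxNeg).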
  case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
    Opc = AArch64::MLAv4i16_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
    Opc = AArch64::MLAv4i16_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
    Opc = AArch64::MLAv8i16_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
    Opc = AArch64::MLAv8i16_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
    Opc = AArch64::MLAv2i32_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
    Opc = AArch64::MLAv2i32_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
    Opc = AArch64::MLAv4i32_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
    Opc = AArch64::MLAv4i32_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;

  case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
    Opc = AArch64::MLAv4i16_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
                                 RC);
    break;
  case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
    Opc = AArch64::MLSv4i16_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
    Opc = AArch64::MLAv8i16_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
                                 RC);
    break;
  case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
    Opc = AArch64::MLSv8i16_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
    Opc = AArch64::MLAv2i32_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
                                 RC);
    break;
  case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
    Opc = AArch64::MLSv2i32_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
    Opc = AArch64::MLAv4i32_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
                                 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
                                 RC);
    break;
  case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
    Opc = AArch64::MLSv4i32_indexed;
    RC = &AArch64::FPR128RegClass;
    MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;

  // Floating Point Support
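  // The scalar mappings follow the AArch64 semantics of the fused multiply
  // instructions (operand letters are illustrative):
  //   FMADD  d = a*b + c     (FMULADD patterns)
  //   FMSUB  d = c - a*b     (fmul is the subtrahend, *_OP2)
  //   FNMSUB d = a*b - c     (fmul is the minuend, *_OP1)
  //   FNMADD d = -(a*b) - c  (negated multiply minus the other operand)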
  case MachineCombinerPattern::FMULADDH_OP1:
    Opc = AArch64::FMADDHrrr;
    RC = &AArch64::FPR16RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::FMULADDS_OP1:
    Opc = AArch64::FMADDSrrr;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::FMULADDD_OP1:
    Opc = AArch64::FMADDDrrr;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;

  case MachineCombinerPattern::FMULADDH_OP2:
    Opc = AArch64::FMADDHrrr;
    RC = &AArch64::FPR16RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::FMULADDS_OP2:
    Opc = AArch64::FMADDSrrr;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::FMULADDD_OP2:
    Opc = AArch64::FMADDDrrr;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;

  case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
    Opc = AArch64::FMLAv1i32_indexed;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                           FMAInstKind::Indexed);
    break;
  case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
    Opc = AArch64::FMLAv1i32_indexed;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;

  case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
    Opc = AArch64::FMLAv1i64_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                           FMAInstKind::Indexed);
    break;
  case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
    Opc = AArch64::FMLAv1i64_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;

  case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FMLAv4i16_indexed;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                           FMAInstKind::Indexed);
    break;
  case MachineCombinerPattern::FMLAv4f16_OP1:
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FMLAv4f16;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                           FMAInstKind::Accumulator);
    break;
  case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FMLAv4i16_indexed;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;
  case MachineCombinerPattern::FMLAv4f16_OP2:
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FMLAv4f16;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Accumulator);
    break;

  case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
  case MachineCombinerPattern::FMLAv2f32_OP1:
    RC = &AArch64::FPR64RegClass;
    if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
      Opc = AArch64::FMLAv2i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLAv2f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;
  case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
  case MachineCombinerPattern::FMLAv2f32_OP2:
    RC = &AArch64::FPR64RegClass;
    if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
      Opc = AArch64::FMLAv2i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLAv2f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;

  case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
    RC = &AArch64::FPR128RegClass;
    Opc = AArch64::FMLAv8i16_indexed;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                           FMAInstKind::Indexed);
    break;
  case MachineCombinerPattern::FMLAv8f16_OP1:
    RC = &AArch64::FPR128RegClass;
    Opc = AArch64::FMLAv8f16;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                           FMAInstKind::Accumulator);
    break;
  case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
    RC = &AArch64::FPR128RegClass;
    Opc = AArch64::FMLAv8i16_indexed;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;
  case MachineCombinerPattern::FMLAv8f16_OP2:
    RC = &AArch64::FPR128RegClass;
    Opc = AArch64::FMLAv8f16;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Accumulator);
    break;

  case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
  case MachineCombinerPattern::FMLAv2f64_OP1:
    RC = &AArch64::FPR128RegClass;
    if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
      Opc = AArch64::FMLAv2i64_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLAv2f64;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;
  case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
  case MachineCombinerPattern::FMLAv2f64_OP2:
    RC = &AArch64::FPR128RegClass;
    if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
      Opc = AArch64::FMLAv2i64_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLAv2f64;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;

  case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
  case MachineCombinerPattern::FMLAv4f32_OP1:
    RC = &AArch64::FPR128RegClass;
    if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
      Opc = AArch64::FMLAv4i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLAv4f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;

  case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
  case MachineCombinerPattern::FMLAv4f32_OP2:
    RC = &AArch64::FPR128RegClass;
    if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
      Opc = AArch64::FMLAv4i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLAv4f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;

  case MachineCombinerPattern::FMULSUBH_OP1:
    Opc = AArch64::FNMSUBHrrr;
    RC = &AArch64::FPR16RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::FMULSUBS_OP1:
    Opc = AArch64::FNMSUBSrrr;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::FMULSUBD_OP1:
    Opc = AArch64::FNMSUBDrrr;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;

  case MachineCombinerPattern::FNMULSUBH_OP1:
    Opc = AArch64::FNMADDHrrr;
    RC = &AArch64::FPR16RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::FNMULSUBS_OP1:
    Opc = AArch64::FNMADDSrrr;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;
  case MachineCombinerPattern::FNMULSUBD_OP1:
    Opc = AArch64::FNMADDDrrr;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
    break;

  case MachineCombinerPattern::FMULSUBH_OP2:
    Opc = AArch64::FMSUBHrrr;
    RC = &AArch64::FPR16RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::FMULSUBS_OP2:
    Opc = AArch64::FMSUBSrrr;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;
  case MachineCombinerPattern::FMULSUBD_OP2:
    Opc = AArch64::FMSUBDrrr;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
    break;

  case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
    Opc = AArch64::FMLSv1i32_indexed;
    RC = &AArch64::FPR32RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;

  case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
    Opc = AArch64::FMLSv1i64_indexed;
    RC = &AArch64::FPR64RegClass;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;

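  // Vector FMLS computes acc - a*b, so it only covers the *_OP2 patterns
  // directly. The *_OP1 patterns (a*b - c) below instead negate c with FNEG
  // and then accumulate with FMLA, yielding (-c) + a*b.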
  case MachineCombinerPattern::FMLSv4f16_OP1:
  case MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
    RC = &AArch64::FPR64RegClass;
    Register NewVR = MRI.createVirtualRegister(RC);
    MachineInstrBuilder MIB1 =
        BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f16), NewVR)
            .add(Root.getOperand(2));
    InsInstrs.push_back(MIB1);
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) {
      Opc = AArch64::FMLAv4f16;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator, &NewVR);
    } else {
      Opc = AArch64::FMLAv4i16_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed, &NewVR);
    }
    break;
  }
  case MachineCombinerPattern::FMLSv4f16_OP2:
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FMLSv4f16;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Accumulator);
    break;
  case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FMLSv4i16_indexed;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;

  case MachineCombinerPattern::FMLSv2f32_OP2:
  case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
    RC = &AArch64::FPR64RegClass;
    if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
      Opc = AArch64::FMLSv2i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLSv2f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;

  case MachineCombinerPattern::FMLSv8f16_OP1:
  case MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
    RC = &AArch64::FPR128RegClass;
    Register NewVR = MRI.createVirtualRegister(RC);
    MachineInstrBuilder MIB1 =
        BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv8f16), NewVR)
            .add(Root.getOperand(2));
    InsInstrs.push_back(MIB1);
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) {
      Opc = AArch64::FMLAv8f16;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator, &NewVR);
    } else {
      Opc = AArch64::FMLAv8i16_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed, &NewVR);
    }
    break;
  }
  case MachineCombinerPattern::FMLSv8f16_OP2:
    RC = &AArch64::FPR128RegClass;
    Opc = AArch64::FMLSv8f16;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Accumulator);
    break;
  case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
    RC = &AArch64::FPR128RegClass;
    Opc = AArch64::FMLSv8i16_indexed;
    MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                           FMAInstKind::Indexed);
    break;

  case MachineCombinerPattern::FMLSv2f64_OP2:
  case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
    RC = &AArch64::FPR128RegClass;
    if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
      Opc = AArch64::FMLSv2i64_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLSv2f64;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;

  case MachineCombinerPattern::FMLSv4f32_OP2:
  case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
    RC = &AArch64::FPR128RegClass;
    if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
      Opc = AArch64::FMLSv4i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Indexed);
    } else {
      Opc = AArch64::FMLSv4f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
                             FMAInstKind::Accumulator);
    }
    break;
  case MachineCombinerPattern::FMLSv2f32_OP1:
  case MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
    RC = &AArch64::FPR64RegClass;
    Register NewVR = MRI.createVirtualRegister(RC);
    MachineInstrBuilder MIB1 =
        BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR)
            .add(Root.getOperand(2));
    InsInstrs.push_back(MIB1);
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
      Opc = AArch64::FMLAv2i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed, &NewVR);
    } else {
      Opc = AArch64::FMLAv2f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator, &NewVR);
    }
    break;
  }
  case MachineCombinerPattern::FMLSv4f32_OP1:
  case MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
    RC = &AArch64::FPR128RegClass;
    Register NewVR = MRI.createVirtualRegister(RC);
    MachineInstrBuilder MIB1 =
        BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR)
            .add(Root.getOperand(2));
    InsInstrs.push_back(MIB1);
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
      Opc = AArch64::FMLAv4i32_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed, &NewVR);
    } else {
      Opc = AArch64::FMLAv4f32;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator, &NewVR);
    }
    break;
  }
  case MachineCombinerPattern::FMLSv2f64_OP1:
  case MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
    RC = &AArch64::FPR128RegClass;
    Register NewVR = MRI.createVirtualRegister(RC);
    MachineInstrBuilder MIB1 =
        BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR)
            .add(Root.getOperand(2));
    InsInstrs.push_back(MIB1);
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
      Opc = AArch64::FMLAv2i64_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed, &NewVR);
    } else {
      Opc = AArch64::FMLAv2f64;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator, &NewVR);
    }
    break;
  }
  } // end switch (Pattern)
  // Record MUL and ADD/SUB for deletion
  // FIXME: This assertion fails in CodeGen/AArch64/tailmerging_in_mbp.ll and
  // CodeGen/AArch64/urem-seteq-nonzero.ll.
  // assert(MUL && "MUL was never set");
  DelInstrs.push_back(MUL);
  DelInstrs.push_back(&Root);
}

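// Note on the contract with the MachineCombiner (informal summary, based on
// how this function populates its output buffers): the instructions in
// InsInstrs are inserted, in order, in place of Root, the instructions in
// DelInstrs are erased, and InstrIdxForVirtReg maps each newly created virtual
// register to the index in InsInstrs of the instruction defining it so the
// combiner can evaluate the depth of the new sequence.
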
/// Replace csinc-branch sequence by simple conditional branch
///
/// Examples:
/// 1. \code
///   csinc w9, wzr, wzr, <condition code>
///   tbnz  w9, #0, 0x44
///    \endcode
/// to
///    \code
///   b.<inverted condition code>
///    \endcode
///
/// 2. \code
///   csinc w9, wzr, wzr, <condition code>
///   tbz   w9, #0, 0x44
///    \endcode
/// to
///    \code
///   b.<condition code>
///    \endcode
///
/// Replace compare and branch sequence by TBZ/TBNZ instruction when the
/// compare's constant operand is a power of 2.
///
/// Examples:
///    \code
///   and  w8, w8, #0x400
///   cbnz w8, L1
///    \endcode
/// to
///    \code
///   tbnz w8, #10, L1
///    \endcode
///
/// \param  MI Conditional Branch
/// \return True when the simple conditional branch is generated
///
bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
  bool IsNegativeBranch = false;
  bool IsTestAndBranch = false;
  unsigned TargetBBInMI = 0;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
    return false;
  case AArch64::CBZW:
  case AArch64::CBZX:
    TargetBBInMI = 1;
    break;
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    TargetBBInMI = 1;
    IsNegativeBranch = true;
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
    TargetBBInMI = 2;
    IsTestAndBranch = true;
    break;
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    TargetBBInMI = 2;
    IsNegativeBranch = true;
    IsTestAndBranch = true;
    break;
  }
  // So we increment a zero register and test for bits other
  // than bit 0? Conservatively bail out in case the verifier
  // missed this case.
  if (IsTestAndBranch && MI.getOperand(1).getImm())
    return false;

  // Find Definition.
  assert(MI.getParent() && "Incomplete machine instruction\n");
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  Register VReg = MI.getOperand(0).getReg();
  if (!Register::isVirtualRegister(VReg))
    return false;

  MachineInstr *DefMI = MRI->getVRegDef(VReg);

  // Look through COPY instructions to find definition.
  while (DefMI->isCopy()) {
    Register CopyVReg = DefMI->getOperand(1).getReg();
    if (!MRI->hasOneNonDBGUse(CopyVReg))
      return false;
    if (!MRI->hasOneDef(CopyVReg))
      return false;
    DefMI = MRI->getVRegDef(CopyVReg);
  }

  switch (DefMI->getOpcode()) {
  default:
    return false;
  // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
  case AArch64::ANDWri:
  case AArch64::ANDXri: {
    if (IsTestAndBranch)
      return false;
    if (DefMI->getParent() != MBB)
      return false;
    if (!MRI->hasOneNonDBGUse(VReg))
      return false;

    bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
    uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
        DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
    if (!isPowerOf2_64(Mask))
      return false;

    MachineOperand &MO = DefMI->getOperand(1);
    Register NewReg = MO.getReg();
    if (!Register::isVirtualRegister(NewReg))
      return false;

    assert(!MRI->def_empty(NewReg) && "Register must be defined.");

    MachineBasicBlock &RefToMBB = *MBB;
    MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
    DebugLoc DL = MI.getDebugLoc();
    unsigned Imm = Log2_64(Mask);
    unsigned Opc = (Imm < 32)
                       ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
                       : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
    MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
                              .addReg(NewReg)
                              .addImm(Imm)
                              .addMBB(TBB);
    // Register lives on to the TBZ/TBNZ now.
    MO.setIsKill(false);

    // For bit indices smaller than 32, we must use the 32-bit (W) variant in
    // all cases; the 64-bit variant cannot encode them.
    // Therefore, if the input register is 64-bit, take its 32-bit sub-register.
    if (!Is32Bit && Imm < 32)
      NewMI->getOperand(0).setSubReg(AArch64::sub_32);
    MI.eraseFromParent();
    return true;
  }
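  // Illustrative example of the sub-register handling above (register numbers
  // are made up): "and x8, x8, #0x8; cbnz x8, L1" becomes "tbnz w8, #3, L1",
  // using the W form because the tested bit index (3) is below 32.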
  // Look for CSINC
  case AArch64::CSINCWr:
  case AArch64::CSINCXr: {
    if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
          DefMI->getOperand(2).getReg() == AArch64::WZR) &&
        !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
          DefMI->getOperand(2).getReg() == AArch64::XZR))
      return false;

    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
      return false;

    AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
    // Convert only when the condition code is not modified between
    // the CSINC and the branch. The CC may be used by other
    // instructions in between.
    if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
      return false;
    MachineBasicBlock &RefToMBB = *MBB;
    MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB();
    DebugLoc DL = MI.getDebugLoc();
    if (IsNegativeBranch)
      CC = AArch64CC::getInvertedCondCode(CC);
    BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
    MI.eraseFromParent();
    return true;
  }
  }
}

std::pair<unsigned, unsigned>
AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
  const unsigned Mask = AArch64II::MO_FRAGMENT;
  return std::make_pair(TF & Mask, TF & ~Mask);
}

ArrayRef<std::pair<unsigned, const char *>>
AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
  using namespace AArch64II;

  static const std::pair<unsigned, const char *> TargetFlags[] = {
      {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
      {MO_G3, "aarch64-g3"},     {MO_G2, "aarch64-g2"},
      {MO_G1, "aarch64-g1"},     {MO_G0, "aarch64-g0"},
      {MO_HI12, "aarch64-hi12"}};
  return makeArrayRef(TargetFlags);
}

ArrayRef<std::pair<unsigned, const char *>>
AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
  using namespace AArch64II;

  static const std::pair<unsigned, const char *> TargetFlags[] = {
      {MO_COFFSTUB, "aarch64-coffstub"},
      {MO_GOT, "aarch64-got"},
      {MO_NC, "aarch64-nc"},
      {MO_S, "aarch64-s"},
      {MO_TLS, "aarch64-tls"},
      {MO_DLLIMPORT, "aarch64-dllimport"},
      {MO_PREL, "aarch64-prel"},
      {MO_TAGGED, "aarch64-tagged"}};
  return makeArrayRef(TargetFlags);
}

ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
  static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
      {{MOSuppressPair, "aarch64-suppress-pair"},
       {MOStridedAccess, "aarch64-strided-access"}};
  return makeArrayRef(TargetFlags);
}

/// Constants defining how certain sequences should be outlined.
/// This encompasses how an outlined function should be called, and what kind
/// of frame should be emitted for that outlined function.
///
/// \p MachineOutlinerDefault implies that the function should be called with
/// a save and restore of LR to the stack.
///
/// That is,
///
/// I1     Save LR                    OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3     Restore LR                 I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 3 (save + BL + restore)
/// * Frame construction overhead: 1 (ret)
/// * Requires stack fixups? Yes
///
/// \p MachineOutlinerTailCall implies that the function is being created from
/// a sequence of instructions ending in a return.
///
/// That is,
///
/// I1                             OUTLINED_FUNCTION:
/// I2 --> B OUTLINED_FUNCTION     I1
/// RET                            I2
///                                RET
///
/// * Call construction overhead: 1 (B)
/// * Frame construction overhead: 0 (Return included in sequence)
/// * Requires stack fixups? No
///
/// \p MachineOutlinerNoLRSave implies that the function should be called using
/// a BL instruction, but doesn't require LR to be saved and restored. This
/// happens when LR is known to be dead.
///
/// That is,
///
/// I1                             OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION    I1
/// I3                             I2
///                                I3
///                                RET
///
/// * Call construction overhead: 1 (BL)
/// * Frame construction overhead: 1 (RET)
/// * Requires stack fixups? No
///
/// \p MachineOutlinerThunk implies that the function is being created from
/// a sequence of instructions ending in a call. The outlined function is
/// called with a BL instruction, and the outlined function tail-calls the
/// original call destination.
///
/// That is,
///
/// I1                             OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION    I1
/// BL f                           I2
///                                B f
///
/// * Call construction overhead: 1 (BL)
/// * Frame construction overhead: 0
/// * Requires stack fixups? No
///
/// \p MachineOutlinerRegSave implies that the function should be called with a
/// save and restore of LR to an available register. This allows us to avoid
/// stack fixups. Note that this outlining variant is compatible with the
/// NoLRSave case.
///
/// That is,
///
/// I1     Save LR                    OUTLINED_FUNCTION:
/// I2 --> BL OUTLINED_FUNCTION       I1
/// I3     Restore LR                 I2
///                                   I3
///                                   RET
///
/// * Call construction overhead: 3 (save + BL + restore)
/// * Frame construction overhead: 1 (ret)
/// * Requires stack fixups? No
enum MachineOutlinerClass {
  MachineOutlinerDefault,  /// Emit a save, restore, call, and return.
  MachineOutlinerTailCall, /// Only emit a branch.
  MachineOutlinerNoLRSave, /// Emit a call and return.
  MachineOutlinerThunk,    /// Emit a call and tail-call.
  MachineOutlinerRegSave   /// Same as default, but save to a register.
};

enum MachineOutlinerMBBFlags {
  LRUnavailableSomewhere = 0x2,
  HasCalls = 0x4,
  UnsafeRegsDead = 0x8
};

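/// Find a register that can hold LR across an outlined call, or return 0 if
/// none exists. X16/X17 are skipped because, as the intra-procedure-call
/// scratch registers, they may be clobbered by linker-inserted veneers, and
/// LR itself is skipped because the BL to the outlined function is about to
/// redefine it.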
unsigned
AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
  assert(C.LRUWasSet && "LRU wasn't set?");
  MachineFunction *MF = C.getMF();
  const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
      MF->getSubtarget().getRegisterInfo());

  // Check if there is an available register across the sequence that we can
  // use.
  for (unsigned Reg : AArch64::GPR64RegClass) {
    if (!ARI->isReservedReg(*MF, Reg) &&
        Reg != AArch64::LR &&  // LR is not reserved, but don't use it.
        Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
        Reg != AArch64::X17 && // Ditto for X17.
        C.LRU.available(Reg) && C.UsedInSequence.available(Reg))
      return Reg;
  }

  // No suitable register. Return 0.
  return 0u;
}

static bool
outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
                                         const outliner::Candidate &b) {
  const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
  const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();

  return MFIa->shouldSignReturnAddress(false) ==
             MFIb->shouldSignReturnAddress(false) &&
         MFIa->shouldSignReturnAddress(true) ==
             MFIb->shouldSignReturnAddress(true);
}

static bool
outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
                                       const outliner::Candidate &b) {
  const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
  const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();

  return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
}

static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
                                                const outliner::Candidate &b) {
  const AArch64Subtarget &SubtargetA =
      a.getMF()->getSubtarget<AArch64Subtarget>();
  const AArch64Subtarget &SubtargetB =
      b.getMF()->getSubtarget<AArch64Subtarget>();
  return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
}

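// The three helpers above define pairwise compatibility between candidates;
// getOutliningCandidateInfo below applies them with std::adjacent_find to
// reject any candidate set that mixes return-address-signing scope, signing
// key, or v8.3a support.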
outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
    std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
  outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
  unsigned SequenceSize =
      std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0,
                      [this](unsigned Sum, const MachineInstr &MI) {
                        return Sum + getInstSizeInBytes(MI);
                      });
  unsigned NumBytesToCreateFrame = 0;

  // We only allow outlining for functions having exactly matching return
  // address signing attributes, i.e., all share the same value for the
  // attribute "sign-return-address" and all share the same type of key they
  // are signed with.
  // Additionally we require all functions to simultaneously either support
  // v8.3a features or not. Otherwise an outlined function could get signed
  // using dedicated v8.3 instructions and a call from a function that doesn't
  // support v8.3 instructions would therefore be invalid.
  if (std::adjacent_find(
          RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
          [](const outliner::Candidate &a, const outliner::Candidate &b) {
            // Return true if a and b are non-equal w.r.t. return address
            // signing or support of v8.3a features.
            if (outliningCandidatesSigningScopeConsensus(a, b) &&
                outliningCandidatesSigningKeyConsensus(a, b) &&
                outliningCandidatesV8_3OpsConsensus(a, b)) {
              return false;
            }
            return true;
          }) != RepeatedSequenceLocs.end()) {
    return outliner::OutlinedFunction();
  }

  // Since at this point all candidates agree on their return address signing,
  // picking just one is fine. If the candidate functions potentially sign
  // their return addresses, the outlined function should do the same. Note
  // that in the case of "sign-return-address"="non-leaf" this is an
  // assumption: It is not certainly true that the outlined function will have
  // to sign its return address, but this decision is made later, when the
  // decision to outline has already been made.
  // The same holds for the number of additional instructions we need: On
  // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
  // necessary. However, at this point we don't know if the outlined function
  // will have a RET instruction so we assume the worst.
  const TargetRegisterInfo &TRI = getRegisterInfo();
  if (FirstCand.getMF()
          ->getInfo<AArch64FunctionInfo>()
          ->shouldSignReturnAddress(true)) {
    // One PAC and one AUT instruction.
    NumBytesToCreateFrame += 8;

    // We have to check if sp modifying instructions would get outlined.
    // If so we only allow outlining if sp is unchanged overall, so matching
    // sub and add instructions are okay to outline, all other sp modifications
    // are not.
    auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
      int SPValue = 0;
      MachineBasicBlock::iterator MBBI = C.front();
      for (;;) {
        if (MBBI->modifiesRegister(AArch64::SP, &TRI)) {
          switch (MBBI->getOpcode()) {
          case AArch64::ADDXri:
          case AArch64::ADDWri:
            assert(MBBI->getNumOperands() == 4 && "Wrong number of operands");
            assert(MBBI->getOperand(2).isImm() &&
                   "Expected operand to be immediate");
            assert(MBBI->getOperand(1).isReg() &&
                   "Expected operand to be a register");
            // Check if the add just increments sp. If so, we search for
            // matching sub instructions that decrement sp. If not, the
            // modification is illegal.
            if (MBBI->getOperand(1).getReg() == AArch64::SP)
              SPValue += MBBI->getOperand(2).getImm();
            else
              return true;
            break;
          case AArch64::SUBXri:
          case AArch64::SUBWri:
            assert(MBBI->getNumOperands() == 4 && "Wrong number of operands");
            assert(MBBI->getOperand(2).isImm() &&
                   "Expected operand to be immediate");
            assert(MBBI->getOperand(1).isReg() &&
                   "Expected operand to be a register");
            // Check if the sub just decrements sp. If so, we search for
            // matching add instructions that increment sp. If not, the
            // modification is illegal.
            if (MBBI->getOperand(1).getReg() == AArch64::SP)
              SPValue -= MBBI->getOperand(2).getImm();
            else
              return true;
            break;
          default:
            return true;
          }
        }
        if (MBBI == C.back())
          break;
        ++MBBI;
      }
      if (SPValue)
        return true;
      return false;
    };
    // Remove candidates with illegal stack modifying instructions.
    llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);

    // If the sequence doesn't have enough candidates left, then we're done.
    if (RepeatedSequenceLocs.size() < 2)
      return outliner::OutlinedFunction();
  }

  // Properties about candidate MBBs that hold for all of them.
  unsigned FlagsSetInAll = 0xF;

  // Compute liveness information for each candidate, and set FlagsSetInAll.
  std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
                [&FlagsSetInAll](outliner::Candidate &C) {
                  FlagsSetInAll &= C.Flags;
                });

  // According to the AArch64 Procedure Call Standard, the following are
  // undefined on entry/exit from a function call:
  //
  // * Registers x16, x17, (and thus w16, w17)
  // * Condition codes (and thus the NZCV register)
  //
  // Because of this, we can't outline any sequence of instructions where one
  // of these registers is live into/across it. Thus, we need to delete those
  // candidates.
  auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) {
    // If the unsafe registers in this block are all dead, then we don't need
    // to compute liveness here.
    if (C.Flags & UnsafeRegsDead)
      return false;
    C.initLRU(TRI);
    LiveRegUnits LRU = C.LRU;
    return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) ||
            !LRU.available(AArch64::NZCV));
  };

  // Are there any candidates where those registers are live?
  if (!(FlagsSetInAll & UnsafeRegsDead)) {
    // Erase every candidate that violates the restrictions above. (It could be
    // true that we have viable candidates, so it's not worth bailing out in
    // the case that, say, 1 out of 20 candidates violates the restrictions.)
    llvm::erase_if(RepeatedSequenceLocs, CantGuaranteeValueAcrossCall);

    // If the sequence doesn't have enough candidates left, then we're done.
    if (RepeatedSequenceLocs.size() < 2)
      return outliner::OutlinedFunction();
  }

  // At this point, we have only "safe" candidates to outline. Figure out
  // frame + call instruction information.

  unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode();

  // Helper lambda which sets call information for every candidate.
  auto SetCandidateCallInfo =
      [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
        for (outliner::Candidate &C : RepeatedSequenceLocs)
          C.setCallInfo(CallID, NumBytesForCall);
      };

  unsigned FrameID = MachineOutlinerDefault;
  NumBytesToCreateFrame += 4;

  bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
    return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
  });

  // We check to see if CFI Instructions are present, and if they are
  // we find the number of CFI Instructions in the candidates.
  unsigned CFICount = 0;
  MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front();
  for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx();
       Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) {
    const std::vector<MCCFIInstruction> &CFIInstructions =
        RepeatedSequenceLocs[0].getMF()->getFrameInstructions();
    if (MBBI->isCFIInstruction()) {
      unsigned CFIIndex = MBBI->getOperand(0).getCFIIndex();
      MCCFIInstruction CFI = CFIInstructions[CFIIndex];
      CFICount++;
    }
    MBBI++;
  }

  // We compare the number of found CFI Instructions to the number of CFI
  // instructions in the parent function for each candidate. We must check this
  // since if we outline one of the CFI instructions in a function, we have to
  // outline them all for correctness. If we do not, the address offsets will
  // be incorrect between the two sections of the program.
  for (outliner::Candidate &C : RepeatedSequenceLocs) {
    std::vector<MCCFIInstruction> CFIInstructions =
        C.getMF()->getFrameInstructions();

    if (CFICount > 0 && CFICount != CFIInstructions.size())
      return outliner::OutlinedFunction();
  }

  // Returns true if an instruction is safe to fix up, false otherwise.
  auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
    if (MI.isCall())
      return true;

    if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
        !MI.readsRegister(AArch64::SP, &TRI))
      return true;

    // Any modification of SP will break our code to save/restore LR.
    // FIXME: We could handle some instructions which add a constant
    // offset to SP, with a bit more work.
    if (MI.modifiesRegister(AArch64::SP, &TRI))
      return false;

    // At this point, we have a stack instruction that we might need to
    // fix up. We'll handle it if it's a load or store.
    if (MI.mayLoadOrStore()) {
      const MachineOperand *Base; // Filled with the base operand of MI.
      int64_t Offset;             // Filled with the offset of MI.
      bool OffsetIsScalable;

      // Does it allow us to offset the base operand and is the base the
      // register SP?
      if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
          !Base->isReg() || Base->getReg() != AArch64::SP)
        return false;

      // Fix-up code below assumes bytes.
      if (OffsetIsScalable)
        return false;

      // Find the minimum/maximum offset for this instruction and check
      // if fixing it up would be in range.
      int64_t MinOffset,
          MaxOffset;             // Unscaled offsets for the instruction.
      TypeSize Scale(0U, false); // The scale to multiply the offsets by.
      unsigned DummyWidth;
      getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);

      Offset += 16; // Update the offset to what it would be if we outlined.
      if (Offset < MinOffset * (int64_t)Scale.getFixedSize() ||
          Offset > MaxOffset * (int64_t)Scale.getFixedSize())
        return false;

      // It's in range, so we can outline it.
      return true;
    }

    // FIXME: Add handling for instructions like "add x0, sp, #8".

    // We can't fix it up, so don't outline it.
    return false;
  };

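  // Example of the fix-up the lambda above is validating (illustrative): with
  // the default frame, LR is spilled by adjusting SP by 16 bytes, which is why
  // SP-relative offsets are checked with Offset += 16. An outlined
  // "ldr x9, [sp, #8]" would have to become "ldr x9, [sp, #24]", and it is
  // only considered safe if that adjusted offset still fits the instruction's
  // addressing range.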
  // True if it's possible to fix up each stack instruction in this sequence.
  // Important for frames/call variants that modify the stack.
  bool AllStackInstrsSafe = std::all_of(
      FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup);

  // If the last instruction in any candidate is a terminator, then we should
  // tail call all of the candidates.
  if (RepeatedSequenceLocs[0].back()->isTerminator()) {
    FrameID = MachineOutlinerTailCall;
    NumBytesToCreateFrame = 0;
    SetCandidateCallInfo(MachineOutlinerTailCall, 4);
  }

  else if (LastInstrOpcode == AArch64::BL ||
           ((LastInstrOpcode == AArch64::BLR ||
             LastInstrOpcode == AArch64::BLRNoIP) &&
            !HasBTI)) {
    // FIXME: Do we need to check if the code after this uses the value of LR?
    FrameID = MachineOutlinerThunk;
    NumBytesToCreateFrame = 0;
    SetCandidateCallInfo(MachineOutlinerThunk, 4);
  }

  else {
    // We need to decide how to emit calls + frames. We can always emit the
    // same frame if we don't need to save to the stack. If we have to save to
    // the stack, then we need a different frame.
    unsigned NumBytesNoStackCalls = 0;
    std::vector<outliner::Candidate> CandidatesWithoutStackFixups;

    // Check if we have to save LR.
    for (outliner::Candidate &C : RepeatedSequenceLocs) {
      C.initLRU(TRI);

      // If we have a noreturn caller, then we're going to be conservative and
      // say that we have to save LR. If we don't have a ret at the end of the
      // block, then we can't reason about liveness accurately.
      //
      // FIXME: We can probably do better than always disabling this in
      // noreturn functions by fixing up the liveness info.
      bool IsNoReturn =
          C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);

      // Is LR available? If so, we don't need a save.
      if (C.LRU.available(AArch64::LR) && !IsNoReturn) {
        NumBytesNoStackCalls += 4;
        C.setCallInfo(MachineOutlinerNoLRSave, 4);
        CandidatesWithoutStackFixups.push_back(C);
      }

      // Is an unused register available? If so, we won't modify the stack, so
      // we can outline with the same frame type as those that don't save LR.
      else if (findRegisterToSaveLRTo(C)) {
        NumBytesNoStackCalls += 12;
        C.setCallInfo(MachineOutlinerRegSave, 12);
        CandidatesWithoutStackFixups.push_back(C);
      }

      // Is SP used in the sequence at all? If not, we don't have to modify
      // the stack, so we are guaranteed to get the same frame.
      else if (C.UsedInSequence.available(AArch64::SP)) {
        NumBytesNoStackCalls += 12;
        C.setCallInfo(MachineOutlinerDefault, 12);
        CandidatesWithoutStackFixups.push_back(C);
      }

      // If we outline this, we need to modify the stack. Pretend we don't
      // outline this by saving all of its bytes.
      else {
        NumBytesNoStackCalls += SequenceSize;
      }
    }

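    // Cost note for the comparison below: a MachineOutlinerDefault call site
    // is 12 bytes (save LR, BL, restore LR), so RepeatedSequenceLocs.size() *
    // 12 is the cost of giving every candidate the default call; if the
    // no-stack-fixup total is no larger, prefer the candidates that avoid
    // touching the stack.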
6426 // bl foo 6427 // restore LR from SP 6428 // ret 6429 // 6430 // Because the code to handle more than one stack fixup does not 6431 // currently have the proper checks for legality, these cases will assert 6432 // in the AArch64 MachineOutliner. This is because the code to do this 6433 // needs more hardening, testing, better checks that generated code is 6434 // legal, etc and because it is only verified to handle a single pass of 6435 // stack fixup. 6436 // 6437 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch 6438 // these cases until they are known to be handled. Bugzilla 46767 is 6439 // referenced in comments at the assert site. 6440 // 6441 // To avoid asserting (or generating non-legal code on noassert builds) 6442 // we remove all candidates which would need more than one stack fixup by 6443 // pruning the cases where the candidate has calls while also having no 6444 // available LR and having no available general purpose registers to copy 6445 // LR to (ie one extra stack save/restore). 6446 // 6447 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { 6448 erase_if(RepeatedSequenceLocs, [this](outliner::Candidate &C) { 6449 return (std::any_of( 6450 C.front(), std::next(C.back()), 6451 [](const MachineInstr &MI) { return MI.isCall(); })) && 6452 (!C.LRU.available(AArch64::LR) || !findRegisterToSaveLRTo(C)); 6453 }); 6454 } 6455 } 6456 6457 // If we dropped all of the candidates, bail out here. 6458 if (RepeatedSequenceLocs.size() < 2) { 6459 RepeatedSequenceLocs.clear(); 6460 return outliner::OutlinedFunction(); 6461 } 6462 } 6463 6464 // Does every candidate's MBB contain a call? If so, then we might have a call 6465 // in the range. 6466 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { 6467 // Check if the range contains a call. These require a save + restore of the 6468 // link register. 6469 bool ModStackToSaveLR = false; 6470 if (std::any_of(FirstCand.front(), FirstCand.back(), 6471 [](const MachineInstr &MI) { return MI.isCall(); })) 6472 ModStackToSaveLR = true; 6473 6474 // Handle the last instruction separately. If this is a tail call, then the 6475 // last instruction is a call. We don't want to save + restore in this case. 6476 // However, it could be possible that the last instruction is a call without 6477 // it being valid to tail call this sequence. We should consider this as 6478 // well. 6479 else if (FrameID != MachineOutlinerThunk && 6480 FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall()) 6481 ModStackToSaveLR = true; 6482 6483 if (ModStackToSaveLR) { 6484 // We can't fix up the stack. Bail out. 6485 if (!AllStackInstrsSafe) { 6486 RepeatedSequenceLocs.clear(); 6487 return outliner::OutlinedFunction(); 6488 } 6489 6490 // Save + restore LR. 6491 NumBytesToCreateFrame += 8; 6492 } 6493 } 6494 6495 // If we have CFI instructions, we can only outline if the outlined section 6496 // can be a tail call 6497 if (FrameID != MachineOutlinerTailCall && CFICount > 0) 6498 return outliner::OutlinedFunction(); 6499 6500 return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, 6501 NumBytesToCreateFrame, FrameID); 6502 } 6503 6504 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom( 6505 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const { 6506 const Function &F = MF.getFunction(); 6507 6508 // Can F be deduplicated by the linker? If it can, don't outline from it. 
  if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
    return false;

  // Don't outline from functions with section markings; the program could
  // expect that all the code is in the named section.
  // FIXME: Allow outlining from multiple functions with the same section
  // marking.
  if (F.hasSection())
    return false;

  // Outlining from functions with redzones is unsafe since the outliner may
  // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
  // outline from it.
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  if (!AFI || AFI->hasRedZone().getValueOr(true))
    return false;

  // FIXME: Teach the outliner to generate/handle Windows unwind info.
  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
    return false;

  // It's safe to outline from MF.
  return true;
}

bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
                                              unsigned &Flags) const {
  // Check if LR is available through all of the MBB. If it's not, then set
  // a flag.
  assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
         "Suitable Machine Function for outlining must track liveness");
  LiveRegUnits LRU(getRegisterInfo());

  std::for_each(MBB.rbegin(), MBB.rend(),
                [&LRU](MachineInstr &MI) { LRU.accumulate(MI); });

  // Check if each of the unsafe registers is available...
  bool W16AvailableInBlock = LRU.available(AArch64::W16);
  bool W17AvailableInBlock = LRU.available(AArch64::W17);
  bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV);

  // If all of these are dead (and not live out), we know we don't have to
  // check them later.
  if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock)
    Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead;

  // Now, add the live outs to the set.
  LRU.addLiveOuts(MBB);

  // If any of these registers is available in the MBB, but also live out of
  // the block, then we know outlining is unsafe.
  if (W16AvailableInBlock && !LRU.available(AArch64::W16))
    return false;
  if (W17AvailableInBlock && !LRU.available(AArch64::W17))
    return false;
  if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV))
    return false;

  // Check if there's a call inside this MachineBasicBlock. If there is, then
  // set a flag.
  if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
    Flags |= MachineOutlinerMBBFlags::HasCalls;

  MachineFunction *MF = MBB.getParent();

  // In the event that we outline, we may have to save LR. If there is an
  // available register in the MBB, then we'll always save LR there. Check if
  // this is true.
  bool CanSaveLR = false;
  const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
      MF->getSubtarget().getRegisterInfo());

  // Check if there is an available register across the sequence that we can
  // use.
  for (unsigned Reg : AArch64::GPR64RegClass) {
    if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR &&
        Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) {
      CanSaveLR = true;
      break;
    }
  }

  // Check if we have a register we can save LR to, and if LR was used
  // somewhere. If both of those things are true, then we need to evaluate the
  // safety of outlining stack instructions later.
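  // (Roughly: LR is live somewhere in this block and there is no free GPR to
  // hold it, so any candidate taken from here may need to spill LR to the
  // stack; the stack-instruction safety checks then have to run for it.)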
  if (!CanSaveLR && !LRU.available(AArch64::LR))
    Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;

  return true;
}

outliner::InstrType
AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
                                   unsigned Flags) const {
  MachineInstr &MI = *MIT;
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();

  // Don't outline anything used for return address signing. The outlined
  // function will be signed later if needed.
  switch (MI.getOpcode()) {
  case AArch64::PACIASP:
  case AArch64::PACIBSP:
  case AArch64::AUTIASP:
  case AArch64::AUTIBSP:
  case AArch64::RETAA:
  case AArch64::RETAB:
  case AArch64::EMITBKEY:
    return outliner::InstrType::Illegal;
  }

  // Don't outline LOHs.
  if (FuncInfo->getLOHRelated().count(&MI))
    return outliner::InstrType::Illegal;

  // We can only outline these if we will tail call the outlined function, or
  // fix up the CFI offsets. Currently, CFI instructions are outlined only if
  // the outlined section is tail called.
  //
  // FIXME: If the proper fixups for the offset are implemented, this should be
  // possible.
  if (MI.isCFIInstruction())
    return outliner::InstrType::Legal;

  // Don't allow debug values to impact outlining type.
  if (MI.isDebugInstr() || MI.isIndirectDebugValue())
    return outliner::InstrType::Invisible;

  // At this point, KILL instructions don't really tell us much, so we can go
  // ahead and skip over them.
  if (MI.isKill())
    return outliner::InstrType::Invisible;

  // Is this a terminator for a basic block?
  if (MI.isTerminator()) {

    // Is this the end of a function?
    if (MI.getParent()->succ_empty())
      return outliner::InstrType::Legal;

    // It's not, so don't outline it.
    return outliner::InstrType::Illegal;
  }

  // Make sure none of the operands are un-outlinable.
  for (const MachineOperand &MOP : MI.operands()) {
    if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
        MOP.isTargetIndex())
      return outliner::InstrType::Illegal;

    // If it uses LR or W30 explicitly, then don't touch it.
    if (MOP.isReg() && !MOP.isImplicit() &&
        (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
      return outliner::InstrType::Illegal;
  }

  // Special cases for instructions that can always be outlined, but will fail
  // the later tests. E.g. ADRP, which is PC-relative but can always be
  // outlined because it doesn't require a *specific* value to be in LR.
  if (MI.getOpcode() == AArch64::ADRP)
    return outliner::InstrType::Legal;

  // If MI is a call we might be able to outline it. We don't want to outline
  // any calls that rely on the position of items on the stack. When we outline
  // something containing a call, we have to emit a save and restore of LR in
  // the outlined function. Currently, this always happens by saving LR to the
  // stack. Thus, if we outline, say, half the parameters for a function call
  // plus the call, then we'll break the callee's expectations for the layout
  // of the stack.
  //
  // FIXME: Allow calls to functions which construct a stack frame, as long
  // as they don't access arguments on the stack.
  // FIXME: Figure out some way to analyze functions defined in other modules.
  // We should be able to compute the memory usage based on the IR calling
  // convention, even if we can't see the definition.
  if (MI.isCall()) {
    // Get the function associated with the call. Look at each operand and find
    // the one that represents the callee and get its name.
    const Function *Callee = nullptr;
    for (const MachineOperand &MOP : MI.operands()) {
      if (MOP.isGlobal()) {
        Callee = dyn_cast<Function>(MOP.getGlobal());
        break;
      }
    }

    // Never outline calls to mcount. There isn't any rule that would require
    // this, but the Linux kernel's "ftrace" feature depends on it.
    if (Callee && Callee->getName() == "\01_mcount")
      return outliner::InstrType::Illegal;

    // If we don't know anything about the callee, assume it depends on the
    // stack layout of the caller. In that case, it's only legal to outline
    // this as a tail call. Explicitly list the call instructions we know
    // about so we don't get unexpected results with call pseudo-instructions.
    auto UnknownCallOutlineType = outliner::InstrType::Illegal;
    if (MI.getOpcode() == AArch64::BLR ||
        MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
      UnknownCallOutlineType = outliner::InstrType::LegalTerminator;

    if (!Callee)
      return UnknownCallOutlineType;

    // We have a function we have information about. Check if it's something
    // we can safely outline.
    MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);

    // We don't know what's going on with the callee at all. Don't touch it.
    if (!CalleeMF)
      return UnknownCallOutlineType;

    // Check if we know anything about the callee saves on the function. If we
    // don't, then don't touch it, since that implies that we haven't
    // computed anything about its stack frame yet.
    MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
    if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
        MFI.getNumObjects() > 0)
      return UnknownCallOutlineType;

    // At this point, we can say that CalleeMF ought not to pass anything on
    // the stack. Therefore, we can outline it.
    return outliner::InstrType::Legal;
  }

  // Don't outline positions.
  if (MI.isPosition())
    return outliner::InstrType::Illegal;

  // Don't touch the link register or W30.
  if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
      MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
    return outliner::InstrType::Illegal;

  // Don't outline BTI instructions, because that will prevent the outlining
  // site from being indirectly callable.
  if (MI.getOpcode() == AArch64::HINT) {
    int64_t Imm = MI.getOperand(0).getImm();
    if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
      return outliner::InstrType::Illegal;
  }

  return outliner::InstrType::Legal;
}

void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
  for (MachineInstr &MI : MBB) {
    const MachineOperand *Base;
    unsigned Width;
    int64_t Offset;
    bool OffsetIsScalable;

    // Is this a load or store with an immediate offset with SP as the base?
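    // (If it is, the rewrite below must keep it addressing the caller's
    // frame. As a sketch, assuming the outlined prologue pushed LR with
    // "str x30, [sp, #-16]!", an original "ldr x0, [sp, #8]" in the sequence
    // has to become "ldr x0, [sp, #24]".)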
    if (!MI.mayLoadOrStore() ||
        !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
                                      &RI) ||
        (Base->isReg() && Base->getReg() != AArch64::SP))
      continue;

    // It is, so we have to fix it up.
    TypeSize Scale(0U, false);
    int64_t Dummy1, Dummy2;

    MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
    assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
    getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
    assert(Scale != 0 && "Unexpected opcode!");
    assert(!OffsetIsScalable && "Expected offset to be a byte offset");

    // We've pushed the return address to the stack, so add 16 to the offset.
    // This is safe, since we already checked if it would overflow when we
    // checked if this instruction was legal to outline.
    int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedSize();
    StackOffsetOperand.setImm(NewImm);
  }
}

static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
                                 bool ShouldSignReturnAddr,
                                 bool ShouldSignReturnAddrWithAKey) {
  if (ShouldSignReturnAddr) {
    MachineBasicBlock::iterator MBBPAC = MBB.begin();
    MachineBasicBlock::iterator MBBAUT = MBB.getFirstTerminator();
    const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
    const TargetInstrInfo *TII = Subtarget.getInstrInfo();
    DebugLoc DL;

    if (MBBAUT != MBB.end())
      DL = MBBAUT->getDebugLoc();

    // At the very beginning of the basic block we insert the following,
    // depending on the key type:
    //
    // a_key:                   b_key:
    //    PACIASP                   EMITBKEY
    //    CFI_INSTRUCTION           PACIBSP
    //                              CFI_INSTRUCTION
    if (ShouldSignReturnAddrWithAKey) {
      BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIASP))
          .setMIFlag(MachineInstr::FrameSetup);
    } else {
      BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::EMITBKEY))
          .setMIFlag(MachineInstr::FrameSetup);
      BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIBSP))
          .setMIFlag(MachineInstr::FrameSetup);
    }
    unsigned CFIIndex =
        MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
    BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION))
        .addCFIIndex(CFIIndex)
        .setMIFlags(MachineInstr::FrameSetup);

    // If v8.3a features are available, we can replace a RET instruction with
    // RETAA or RETAB and omit the AUT instructions.
    if (Subtarget.hasPAuth() && MBBAUT != MBB.end() &&
        MBBAUT->getOpcode() == AArch64::RET) {
      BuildMI(MBB, MBBAUT, DL,
              TII->get(ShouldSignReturnAddrWithAKey ? AArch64::RETAA
                                                    : AArch64::RETAB))
          .copyImplicitOps(*MBBAUT);
      MBB.erase(MBBAUT);
    } else {
      BuildMI(MBB, MBBAUT, DL,
              TII->get(ShouldSignReturnAddrWithAKey ? AArch64::AUTIASP
                                                    : AArch64::AUTIBSP))
          .setMIFlag(MachineInstr::FrameDestroy);
    }
  }
}

void AArch64InstrInfo::buildOutlinedFrame(
    MachineBasicBlock &MBB, MachineFunction &MF,
    const outliner::OutlinedFunction &OF) const {

  AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();

  if (OF.FrameConstructionID == MachineOutlinerTailCall)
    FI->setOutliningStyle("Tail Call");
  else if (OF.FrameConstructionID == MachineOutlinerThunk) {
    // For thunk outlining, rewrite the last instruction from a call to a
    // tail-call.
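    // (For example, a trailing "bl callee" becomes a TCRETURNdi, which is
    // later expanded to a plain "b callee", so the thunk itself never needs
    // to save and restore LR around it.)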
    MachineInstr *Call = &*--MBB.instr_end();
    unsigned TailOpcode;
    if (Call->getOpcode() == AArch64::BL) {
      TailOpcode = AArch64::TCRETURNdi;
    } else {
      assert(Call->getOpcode() == AArch64::BLR ||
             Call->getOpcode() == AArch64::BLRNoIP);
      TailOpcode = AArch64::TCRETURNriALL;
    }
    MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
                           .add(Call->getOperand(0))
                           .addImm(0);
    MBB.insert(MBB.end(), TC);
    Call->eraseFromParent();

    FI->setOutliningStyle("Thunk");
  }

  bool IsLeafFunction = true;

  // Is there a call in the outlined range?
  auto IsNonTailCall = [](const MachineInstr &MI) {
    return MI.isCall() && !MI.isReturn();
  };

  if (llvm::any_of(MBB.instrs(), IsNonTailCall)) {
    // Fix up the instructions in the range, since we're going to modify the
    // stack.

    // Bugzilla ID: 46767
    // TODO: Check if fixing up twice is safe so we can outline these.
    assert(OF.FrameConstructionID != MachineOutlinerDefault &&
           "Can only fix up stack references once");
    fixupPostOutline(MBB);

    IsLeafFunction = false;

    // LR has to be a live in so that we can save it.
    if (!MBB.isLiveIn(AArch64::LR))
      MBB.addLiveIn(AArch64::LR);

    MachineBasicBlock::iterator It = MBB.begin();
    MachineBasicBlock::iterator Et = MBB.end();

    if (OF.FrameConstructionID == MachineOutlinerTailCall ||
        OF.FrameConstructionID == MachineOutlinerThunk)
      Et = std::prev(MBB.end());

    // Insert a save before the outlined region.
    MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
                                .addReg(AArch64::SP, RegState::Define)
                                .addReg(AArch64::LR)
                                .addReg(AArch64::SP)
                                .addImm(-16);
    It = MBB.insert(It, STRXpre);

    const TargetSubtargetInfo &STI = MF.getSubtarget();
    const MCRegisterInfo *MRI = STI.getRegisterInfo();
    unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);

    // Add a CFI saying the stack was moved 16 B down.
    int64_t StackPosEntry =
        MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));
    BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
        .addCFIIndex(StackPosEntry)
        .setMIFlags(MachineInstr::FrameSetup);

    // Add a CFI saying that the LR that we want to find is now 16 B higher
    // than before.
    int64_t LRPosEntry =
        MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, -16));
    BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
        .addCFIIndex(LRPosEntry)
        .setMIFlags(MachineInstr::FrameSetup);

    // Insert a restore before the terminator for the function.
    MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
                                 .addReg(AArch64::SP, RegState::Define)
                                 .addReg(AArch64::LR, RegState::Define)
                                 .addReg(AArch64::SP)
                                 .addImm(16);
    Et = MBB.insert(Et, LDRXpost);
  }

  // If a bunch of candidates reach this point, they must agree on their return
  // address signing. It is therefore enough to just consider the signing
  // behaviour of one of them.
  const auto &MFI =
      *OF.Candidates.front().getMF()->getInfo<AArch64FunctionInfo>();
  bool ShouldSignReturnAddr = MFI.shouldSignReturnAddress(!IsLeafFunction);

  // a_key is the default.
  bool ShouldSignReturnAddrWithAKey = !MFI.shouldSignWithBKey();

  // If this is a tail call outlined function, then there's already a return.
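  // (In that case only the pointer-authentication prologue/epilogue may still
  // need to be added, which signOutlinedFunction handles below.)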
  if (OF.FrameConstructionID == MachineOutlinerTailCall ||
      OF.FrameConstructionID == MachineOutlinerThunk) {
    signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
                         ShouldSignReturnAddrWithAKey);
    return;
  }

  // It's not a tail call, so we have to insert the return ourselves.

  // LR has to be a live in so that we can return to it.
  if (!MBB.isLiveIn(AArch64::LR))
    MBB.addLiveIn(AArch64::LR);

  MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
                          .addReg(AArch64::LR);
  MBB.insert(MBB.end(), ret);

  signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
                       ShouldSignReturnAddrWithAKey);

  FI->setOutliningStyle("Function");

  // Did we have to modify the stack by saving the link register?
  if (OF.FrameConstructionID != MachineOutlinerDefault)
    return;

  // We modified the stack.
  // Walk over the basic block and fix up all the stack accesses.
  fixupPostOutline(MBB);
}

MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
    Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
    MachineFunction &MF, const outliner::Candidate &C) const {

  // Are we tail calling?
  if (C.CallConstructionID == MachineOutlinerTailCall) {
    // If yes, then we can just branch to the label.
    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
                            .addGlobalAddress(M.getNamedValue(MF.getName()))
                            .addImm(0));
    return It;
  }

  // Are we saving the link register?
  if (C.CallConstructionID == MachineOutlinerNoLRSave ||
      C.CallConstructionID == MachineOutlinerThunk) {
    // No, so just insert the call.
    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
                            .addGlobalAddress(M.getNamedValue(MF.getName())));
    return It;
  }

  // We want to return the spot where we inserted the call.
  MachineBasicBlock::iterator CallPt;

  // Instructions for saving and restoring LR around the call instruction we're
  // going to insert.
  MachineInstr *Save;
  MachineInstr *Restore;
  // Can we save to a register?
  if (C.CallConstructionID == MachineOutlinerRegSave) {
    // FIXME: This logic should be sunk into a target-specific interface so
    // that we don't have to recompute the register.
    unsigned Reg = findRegisterToSaveLRTo(C);
    assert(Reg != 0 && "No callee-saved register available?");

    // Save and restore LR from that register.
    Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
               .addReg(AArch64::XZR)
               .addReg(AArch64::LR)
               .addImm(0);
    Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
                  .addReg(AArch64::XZR)
                  .addReg(Reg)
                  .addImm(0);
  } else {
    // We have the default case. Save and restore from SP.
    Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
               .addReg(AArch64::SP, RegState::Define)
               .addReg(AArch64::LR)
               .addReg(AArch64::SP)
               .addImm(-16);
    Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
                  .addReg(AArch64::SP, RegState::Define)
                  .addReg(AArch64::LR, RegState::Define)
                  .addReg(AArch64::SP)
                  .addImm(16);
  }

  It = MBB.insert(It, Save);
  It++;

  // Insert the call.
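  // (Sketch of the resulting call site in the RegSave case, with xN being the
  // scratch register found above:
  //    mov xN, x30
  //    bl  OUTLINED_FUNCTION
  //    mov x30, xN
  // The default case instead wraps the bl in "str x30, [sp, #-16]!" and
  // "ldr x30, [sp], #16".)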
  It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
                          .addGlobalAddress(M.getNamedValue(MF.getName())));
  CallPt = It;
  It++;

  It = MBB.insert(It, Restore);
  return CallPt;
}

bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
    MachineFunction &MF) const {
  return MF.getFunction().hasMinSize();
}

Optional<DestSourcePair>
AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {

  // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR as the first source
  // register and a zero shift immediate are used as aliases for the mov
  // instruction.
  if (MI.getOpcode() == AArch64::ORRWrs &&
      MI.getOperand(1).getReg() == AArch64::WZR &&
      MI.getOperand(3).getImm() == 0x0) {
    return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
  }

  if (MI.getOpcode() == AArch64::ORRXrs &&
      MI.getOperand(1).getReg() == AArch64::XZR &&
      MI.getOperand(3).getImm() == 0x0) {
    return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
  }

  return None;
}

Optional<RegImmPair> AArch64InstrInfo::isAddImmediate(const MachineInstr &MI,
                                                      Register Reg) const {
  int Sign = 1;
  int64_t Offset = 0;

  // TODO: Handle cases where Reg is a super- or sub-register of the
  // destination register.
  const MachineOperand &Op0 = MI.getOperand(0);
  if (!Op0.isReg() || Reg != Op0.getReg())
    return None;

  switch (MI.getOpcode()) {
  default:
    return None;
  case AArch64::SUBWri:
  case AArch64::SUBXri:
  case AArch64::SUBSWri:
  case AArch64::SUBSXri:
    Sign *= -1;
    LLVM_FALLTHROUGH;
  case AArch64::ADDSWri:
  case AArch64::ADDSXri:
  case AArch64::ADDWri:
  case AArch64::ADDXri: {
    // TODO: Third operand can be global address (usually some string).
    if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
        !MI.getOperand(2).isImm())
      return None;
    int Shift = MI.getOperand(3).getImm();
    assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
    Offset = Sign * (MI.getOperand(2).getImm() << Shift);
  }
  }
  return RegImmPair{MI.getOperand(1).getReg(), Offset};
}

/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
/// the destination register then, if possible, describe the value in terms of
/// the source register.
static Optional<ParamLoadedValue>
describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
                       const TargetInstrInfo *TII,
                       const TargetRegisterInfo *TRI) {
  auto DestSrc = TII->isCopyInstr(MI);
  if (!DestSrc)
    return None;

  Register DestReg = DestSrc->Destination->getReg();
  Register SrcReg = DestSrc->Source->getReg();

  auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});

  // If the described register is the destination, just return the source.
  if (DestReg == DescribedReg)
    return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);

  // ORRWrs zero-extends to 64 bits, so we need to consider such cases.
  if (MI.getOpcode() == AArch64::ORRWrs &&
      TRI->isSuperRegister(DestReg, DescribedReg))
    return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);

  // We may need to describe the lower part of a ORRXrs move.
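  // (For example, for the 64-bit copy "mov x1, x0" (ORRXrs x1, xzr, x0, 0),
  // a query about w1 can be answered with w0, the matching sub-register of
  // the source.)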
  if (MI.getOpcode() == AArch64::ORRXrs &&
      TRI->isSubRegister(DestReg, DescribedReg)) {
    Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
    return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
  }

  assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
         "Unhandled ORR[XW]rs copy case");

  return None;
}

Optional<ParamLoadedValue>
AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
                                      Register Reg) const {
  const MachineFunction *MF = MI.getMF();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  switch (MI.getOpcode()) {
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: {
    // MOVZWi may be used for producing zero-extended 32-bit immediates in
    // 64-bit parameters, so we need to consider super-registers.
    if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
      return None;

    if (!MI.getOperand(1).isImm())
      return None;
    int64_t Immediate = MI.getOperand(1).getImm();
    int Shift = MI.getOperand(2).getImm();
    return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
                            nullptr);
  }
  case AArch64::ORRWrs:
  case AArch64::ORRXrs:
    return describeORRLoadedValue(MI, Reg, this, TRI);
  }

  return TargetInstrInfo::describeLoadedValue(MI, Reg);
}

uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::ElementSizeMask;
}

bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
}

bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
}

unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
  if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
    return AArch64::BLRNoIP;
  else
    return AArch64::BLR;
}

#define GET_INSTRINFO_HELPERS
#define GET_INSTRMAP_INFO
#include "AArch64GenInstrInfo.inc"