1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains the AArch64 implementation of the TargetInstrInfo class. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "AArch64InstrInfo.h" 14 #include "AArch64MachineFunctionInfo.h" 15 #include "AArch64Subtarget.h" 16 #include "MCTargetDesc/AArch64AddressingModes.h" 17 #include "Utils/AArch64BaseInfo.h" 18 #include "llvm/ADT/ArrayRef.h" 19 #include "llvm/ADT/STLExtras.h" 20 #include "llvm/ADT/SmallVector.h" 21 #include "llvm/CodeGen/MachineBasicBlock.h" 22 #include "llvm/CodeGen/MachineFrameInfo.h" 23 #include "llvm/CodeGen/MachineFunction.h" 24 #include "llvm/CodeGen/MachineInstr.h" 25 #include "llvm/CodeGen/MachineInstrBuilder.h" 26 #include "llvm/CodeGen/MachineMemOperand.h" 27 #include "llvm/CodeGen/MachineOperand.h" 28 #include "llvm/CodeGen/MachineRegisterInfo.h" 29 #include "llvm/CodeGen/MachineModuleInfo.h" 30 #include "llvm/CodeGen/StackMaps.h" 31 #include "llvm/CodeGen/TargetRegisterInfo.h" 32 #include "llvm/CodeGen/TargetSubtargetInfo.h" 33 #include "llvm/IR/DebugLoc.h" 34 #include "llvm/IR/GlobalValue.h" 35 #include "llvm/MC/MCAsmInfo.h" 36 #include "llvm/MC/MCInst.h" 37 #include "llvm/MC/MCInstrDesc.h" 38 #include "llvm/Support/Casting.h" 39 #include "llvm/Support/CodeGen.h" 40 #include "llvm/Support/CommandLine.h" 41 #include "llvm/Support/Compiler.h" 42 #include "llvm/Support/ErrorHandling.h" 43 #include "llvm/Support/MathExtras.h" 44 #include "llvm/Target/TargetMachine.h" 45 #include "llvm/Target/TargetOptions.h" 46 #include <cassert> 47 #include <cstdint> 48 #include <iterator> 49 #include <utility> 50 51 using namespace llvm; 52 53 #define GET_INSTRINFO_CTOR_DTOR 54 #include "AArch64GenInstrInfo.inc" 55 56 static cl::opt<unsigned> TBZDisplacementBits( 57 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), 58 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); 59 60 static cl::opt<unsigned> CBZDisplacementBits( 61 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), 62 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)")); 63 64 static cl::opt<unsigned> 65 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), 66 cl::desc("Restrict range of Bcc instructions (DEBUG)")); 67 68 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) 69 : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP, 70 AArch64::CATCHRET), 71 RI(STI.getTargetTriple()), Subtarget(STI) {} 72 73 /// GetInstSize - Return the number of bytes of code the specified 74 /// instruction may be. This returns the maximum number of bytes. 75 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { 76 const MachineBasicBlock &MBB = *MI.getParent(); 77 const MachineFunction *MF = MBB.getParent(); 78 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); 79 80 { 81 auto Op = MI.getOpcode(); 82 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR) 83 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); 84 } 85 86 // Meta-instructions emit no code. 
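// (e.g. DBG_VALUE, IMPLICIT_DEF, KILL and CFI_INSTRUCTION are bookkeeping-only
// and therefore have size 0.)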
87 if (MI.isMetaInstruction()) 88 return 0; 89 90 // FIXME: We currently only handle pseudoinstructions that don't get expanded 91 // before the assembly printer. 92 unsigned NumBytes = 0; 93 const MCInstrDesc &Desc = MI.getDesc(); 94 switch (Desc.getOpcode()) { 95 default: 96 // Anything not explicitly designated otherwise is a normal 4-byte insn. 97 NumBytes = 4; 98 break; 99 case TargetOpcode::STACKMAP: 100 // The upper bound for a stackmap intrinsic is the full length of its shadow. 101 NumBytes = StackMapOpers(&MI).getNumPatchBytes(); 102 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 103 break; 104 case TargetOpcode::PATCHPOINT: 105 // The size of the patchpoint intrinsic is the number of bytes requested. 106 NumBytes = PatchPointOpers(&MI).getNumPatchBytes(); 107 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 108 break; 109 case AArch64::TLSDESC_CALLSEQ: 110 // This gets lowered to an instruction sequence which takes 16 bytes. 111 NumBytes = 16; 112 break; 113 case AArch64::JumpTableDest32: 114 case AArch64::JumpTableDest16: 115 case AArch64::JumpTableDest8: 116 NumBytes = 12; 117 break; 118 case AArch64::SPACE: 119 NumBytes = MI.getOperand(1).getImm(); 120 break; 121 } 122 123 return NumBytes; 124 } 125 126 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, 127 SmallVectorImpl<MachineOperand> &Cond) { 128 // Block ends with fall-through condbranch. 129 switch (LastInst->getOpcode()) { 130 default: 131 llvm_unreachable("Unknown branch instruction?"); 132 case AArch64::Bcc: 133 Target = LastInst->getOperand(1).getMBB(); 134 Cond.push_back(LastInst->getOperand(0)); 135 break; 136 case AArch64::CBZW: 137 case AArch64::CBZX: 138 case AArch64::CBNZW: 139 case AArch64::CBNZX: 140 Target = LastInst->getOperand(1).getMBB(); 141 Cond.push_back(MachineOperand::CreateImm(-1)); 142 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); 143 Cond.push_back(LastInst->getOperand(0)); 144 break; 145 case AArch64::TBZW: 146 case AArch64::TBZX: 147 case AArch64::TBNZW: 148 case AArch64::TBNZX: 149 Target = LastInst->getOperand(2).getMBB(); 150 Cond.push_back(MachineOperand::CreateImm(-1)); 151 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); 152 Cond.push_back(LastInst->getOperand(0)); 153 Cond.push_back(LastInst->getOperand(1)); 154 } 155 } 156 157 static unsigned getBranchDisplacementBits(unsigned Opc) { 158 switch (Opc) { 159 default: 160 llvm_unreachable("unexpected opcode!"); 161 case AArch64::B: 162 return 64; 163 case AArch64::TBNZW: 164 case AArch64::TBZW: 165 case AArch64::TBNZX: 166 case AArch64::TBZX: 167 return TBZDisplacementBits; 168 case AArch64::CBNZW: 169 case AArch64::CBZW: 170 case AArch64::CBNZX: 171 case AArch64::CBZX: 172 return CBZDisplacementBits; 173 case AArch64::Bcc: 174 return BCCDisplacementBits; 175 } 176 } 177 178 bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp, 179 int64_t BrOffset) const { 180 unsigned Bits = getBranchDisplacementBits(BranchOp); 181 assert(Bits >= 3 && "max branch displacement must be enough to jump " 182 "over conditional branch expansion"); 183 return isIntN(Bits, BrOffset / 4); 184 } 185 186 MachineBasicBlock * 187 AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const { 188 switch (MI.getOpcode()) { 189 default: 190 llvm_unreachable("unexpected opcode!"); 191 case AArch64::B: 192 return MI.getOperand(0).getMBB(); 193 case AArch64::TBZW: 194 case AArch64::TBNZW: 195 case AArch64::TBZX: 196 case AArch64::TBNZX: 197 return
MI.getOperand(2).getMBB(); 198 case AArch64::CBZW: 199 case AArch64::CBNZW: 200 case AArch64::CBZX: 201 case AArch64::CBNZX: 202 case AArch64::Bcc: 203 return MI.getOperand(1).getMBB(); 204 } 205 } 206 207 // Branch analysis. 208 bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB, 209 MachineBasicBlock *&TBB, 210 MachineBasicBlock *&FBB, 211 SmallVectorImpl<MachineOperand> &Cond, 212 bool AllowModify) const { 213 // If the block has no terminators, it just falls into the block after it. 214 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); 215 if (I == MBB.end()) 216 return false; 217 218 if (!isUnpredicatedTerminator(*I)) 219 return false; 220 221 // Get the last instruction in the block. 222 MachineInstr *LastInst = &*I; 223 224 // If there is only one terminator instruction, process it. 225 unsigned LastOpc = LastInst->getOpcode(); 226 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { 227 if (isUncondBranchOpcode(LastOpc)) { 228 TBB = LastInst->getOperand(0).getMBB(); 229 return false; 230 } 231 if (isCondBranchOpcode(LastOpc)) { 232 // Block ends with fall-through condbranch. 233 parseCondBranch(LastInst, TBB, Cond); 234 return false; 235 } 236 return true; // Can't handle indirect branch. 237 } 238 239 // Get the instruction before it if it is a terminator. 240 MachineInstr *SecondLastInst = &*I; 241 unsigned SecondLastOpc = SecondLastInst->getOpcode(); 242 243 // If AllowModify is true and the block ends with two or more unconditional 244 // branches, delete all but the first unconditional branch. 245 if (AllowModify && isUncondBranchOpcode(LastOpc)) { 246 while (isUncondBranchOpcode(SecondLastOpc)) { 247 LastInst->eraseFromParent(); 248 LastInst = SecondLastInst; 249 LastOpc = LastInst->getOpcode(); 250 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { 251 // Return now the only terminator is an unconditional branch. 252 TBB = LastInst->getOperand(0).getMBB(); 253 return false; 254 } else { 255 SecondLastInst = &*I; 256 SecondLastOpc = SecondLastInst->getOpcode(); 257 } 258 } 259 } 260 261 // If there are three terminators, we don't know what sort of block this is. 262 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I)) 263 return true; 264 265 // If the block ends with a B and a Bcc, handle it. 266 if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { 267 parseCondBranch(SecondLastInst, TBB, Cond); 268 FBB = LastInst->getOperand(0).getMBB(); 269 return false; 270 } 271 272 // If the block ends with two unconditional branches, handle it. The second 273 // one is not executed, so remove it. 274 if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { 275 TBB = SecondLastInst->getOperand(0).getMBB(); 276 I = LastInst; 277 if (AllowModify) 278 I->eraseFromParent(); 279 return false; 280 } 281 282 // ...likewise if it ends with an indirect branch followed by an unconditional 283 // branch. 284 if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { 285 I = LastInst; 286 if (AllowModify) 287 I->eraseFromParent(); 288 return true; 289 } 290 291 // Otherwise, can't handle this. 
292 return true; 293 } 294 295 bool AArch64InstrInfo::reverseBranchCondition( 296 SmallVectorImpl<MachineOperand> &Cond) const { 297 if (Cond[0].getImm() != -1) { 298 // Regular Bcc 299 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm(); 300 Cond[0].setImm(AArch64CC::getInvertedCondCode(CC)); 301 } else { 302 // Folded compare-and-branch 303 switch (Cond[1].getImm()) { 304 default: 305 llvm_unreachable("Unknown conditional branch!"); 306 case AArch64::CBZW: 307 Cond[1].setImm(AArch64::CBNZW); 308 break; 309 case AArch64::CBNZW: 310 Cond[1].setImm(AArch64::CBZW); 311 break; 312 case AArch64::CBZX: 313 Cond[1].setImm(AArch64::CBNZX); 314 break; 315 case AArch64::CBNZX: 316 Cond[1].setImm(AArch64::CBZX); 317 break; 318 case AArch64::TBZW: 319 Cond[1].setImm(AArch64::TBNZW); 320 break; 321 case AArch64::TBNZW: 322 Cond[1].setImm(AArch64::TBZW); 323 break; 324 case AArch64::TBZX: 325 Cond[1].setImm(AArch64::TBNZX); 326 break; 327 case AArch64::TBNZX: 328 Cond[1].setImm(AArch64::TBZX); 329 break; 330 } 331 } 332 333 return false; 334 } 335 336 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB, 337 int *BytesRemoved) const { 338 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); 339 if (I == MBB.end()) 340 return 0; 341 342 if (!isUncondBranchOpcode(I->getOpcode()) && 343 !isCondBranchOpcode(I->getOpcode())) 344 return 0; 345 346 // Remove the branch. 347 I->eraseFromParent(); 348 349 I = MBB.end(); 350 351 if (I == MBB.begin()) { 352 if (BytesRemoved) 353 *BytesRemoved = 4; 354 return 1; 355 } 356 --I; 357 if (!isCondBranchOpcode(I->getOpcode())) { 358 if (BytesRemoved) 359 *BytesRemoved = 4; 360 return 1; 361 } 362 363 // Remove the branch. 364 I->eraseFromParent(); 365 if (BytesRemoved) 366 *BytesRemoved = 8; 367 368 return 2; 369 } 370 371 void AArch64InstrInfo::instantiateCondBranch( 372 MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB, 373 ArrayRef<MachineOperand> Cond) const { 374 if (Cond[0].getImm() != -1) { 375 // Regular Bcc 376 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB); 377 } else { 378 // Folded compare-and-branch 379 // Note that we use addOperand instead of addReg to keep the flags. 380 const MachineInstrBuilder MIB = 381 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]); 382 if (Cond.size() > 3) 383 MIB.addImm(Cond[3].getImm()); 384 MIB.addMBB(TBB); 385 } 386 } 387 388 unsigned AArch64InstrInfo::insertBranch( 389 MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, 390 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const { 391 // Shouldn't be a fall through. 392 assert(TBB && "insertBranch must not be told to insert a fallthrough"); 393 394 if (!FBB) { 395 if (Cond.empty()) // Unconditional branch? 396 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB); 397 else 398 instantiateCondBranch(MBB, DL, TBB, Cond); 399 400 if (BytesAdded) 401 *BytesAdded = 4; 402 403 return 1; 404 } 405 406 // Two-way conditional branch. 407 instantiateCondBranch(MBB, DL, TBB, Cond); 408 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB); 409 410 if (BytesAdded) 411 *BytesAdded = 8; 412 413 return 2; 414 } 415 416 // Find the original register that VReg is copied from. 
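// Walks the definition chain through full COPY instructions and stops at the
// first non-copy definition or at a physical register.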
417 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { 418 while (Register::isVirtualRegister(VReg)) { 419 const MachineInstr *DefMI = MRI.getVRegDef(VReg); 420 if (!DefMI->isFullCopy()) 421 return VReg; 422 VReg = DefMI->getOperand(1).getReg(); 423 } 424 return VReg; 425 } 426 427 // Determine if VReg is defined by an instruction that can be folded into a 428 // csel instruction. If so, return the folded opcode, and the replacement 429 // register. 430 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, 431 unsigned *NewVReg = nullptr) { 432 VReg = removeCopies(MRI, VReg); 433 if (!Register::isVirtualRegister(VReg)) 434 return 0; 435 436 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg)); 437 const MachineInstr *DefMI = MRI.getVRegDef(VReg); 438 unsigned Opc = 0; 439 unsigned SrcOpNum = 0; 440 switch (DefMI->getOpcode()) { 441 case AArch64::ADDSXri: 442 case AArch64::ADDSWri: 443 // if NZCV is used, do not fold. 444 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 445 return 0; 446 // fall-through to ADDXri and ADDWri. 447 LLVM_FALLTHROUGH; 448 case AArch64::ADDXri: 449 case AArch64::ADDWri: 450 // add x, 1 -> csinc. 451 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 || 452 DefMI->getOperand(3).getImm() != 0) 453 return 0; 454 SrcOpNum = 1; 455 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr; 456 break; 457 458 case AArch64::ORNXrr: 459 case AArch64::ORNWrr: { 460 // not x -> csinv, represented as orn dst, xzr, src. 461 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); 462 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) 463 return 0; 464 SrcOpNum = 2; 465 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr; 466 break; 467 } 468 469 case AArch64::SUBSXrr: 470 case AArch64::SUBSWrr: 471 // if NZCV is used, do not fold. 472 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 473 return 0; 474 // fall-through to SUBXrr and SUBWrr. 475 LLVM_FALLTHROUGH; 476 case AArch64::SUBXrr: 477 case AArch64::SUBWrr: { 478 // neg x -> csneg, represented as sub dst, xzr, src. 479 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); 480 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) 481 return 0; 482 SrcOpNum = 2; 483 Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr; 484 break; 485 } 486 default: 487 return 0; 488 } 489 assert(Opc && SrcOpNum && "Missing parameters"); 490 491 if (NewVReg) 492 *NewVReg = DefMI->getOperand(SrcOpNum).getReg(); 493 return Opc; 494 } 495 496 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 497 ArrayRef<MachineOperand> Cond, 498 unsigned TrueReg, unsigned FalseReg, 499 int &CondCycles, int &TrueCycles, 500 int &FalseCycles) const { 501 // Check register classes. 502 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 503 const TargetRegisterClass *RC = 504 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); 505 if (!RC) 506 return false; 507 508 // Expanding cbz/tbz requires an extra cycle of latency on the condition. 509 unsigned ExtraCondLat = Cond.size() != 1; 510 511 // GPRs are handled by csel. 512 // FIXME: Fold in x+1, -x, and ~x when applicable. 513 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) || 514 AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 515 // Single-cycle csel, csinc, csinv, and csneg. 
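// E.g. an 'add Wd, Wn, #1' feeding one side of the select can be folded into a
// csinc, which is why TrueCycles/FalseCycles drops to 0 below.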
516 CondCycles = 1 + ExtraCondLat; 517 TrueCycles = FalseCycles = 1; 518 if (canFoldIntoCSel(MRI, TrueReg)) 519 TrueCycles = 0; 520 else if (canFoldIntoCSel(MRI, FalseReg)) 521 FalseCycles = 0; 522 return true; 523 } 524 525 // Scalar floating point is handled by fcsel. 526 // FIXME: Form fabs, fmin, and fmax when applicable. 527 if (AArch64::FPR64RegClass.hasSubClassEq(RC) || 528 AArch64::FPR32RegClass.hasSubClassEq(RC)) { 529 CondCycles = 5 + ExtraCondLat; 530 TrueCycles = FalseCycles = 2; 531 return true; 532 } 533 534 // Can't do vectors. 535 return false; 536 } 537 538 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, 539 MachineBasicBlock::iterator I, 540 const DebugLoc &DL, unsigned DstReg, 541 ArrayRef<MachineOperand> Cond, 542 unsigned TrueReg, unsigned FalseReg) const { 543 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 544 545 // Parse the condition code, see parseCondBranch() above. 546 AArch64CC::CondCode CC; 547 switch (Cond.size()) { 548 default: 549 llvm_unreachable("Unknown condition opcode in Cond"); 550 case 1: // b.cc 551 CC = AArch64CC::CondCode(Cond[0].getImm()); 552 break; 553 case 3: { // cbz/cbnz 554 // We must insert a compare against 0. 555 bool Is64Bit; 556 switch (Cond[1].getImm()) { 557 default: 558 llvm_unreachable("Unknown branch opcode in Cond"); 559 case AArch64::CBZW: 560 Is64Bit = false; 561 CC = AArch64CC::EQ; 562 break; 563 case AArch64::CBZX: 564 Is64Bit = true; 565 CC = AArch64CC::EQ; 566 break; 567 case AArch64::CBNZW: 568 Is64Bit = false; 569 CC = AArch64CC::NE; 570 break; 571 case AArch64::CBNZX: 572 Is64Bit = true; 573 CC = AArch64CC::NE; 574 break; 575 } 576 Register SrcReg = Cond[2].getReg(); 577 if (Is64Bit) { 578 // cmp reg, #0 is actually subs xzr, reg, #0. 579 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass); 580 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR) 581 .addReg(SrcReg) 582 .addImm(0) 583 .addImm(0); 584 } else { 585 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass); 586 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR) 587 .addReg(SrcReg) 588 .addImm(0) 589 .addImm(0); 590 } 591 break; 592 } 593 case 4: { // tbz/tbnz 594 // We must insert a tst instruction. 595 switch (Cond[1].getImm()) { 596 default: 597 llvm_unreachable("Unknown branch opcode in Cond"); 598 case AArch64::TBZW: 599 case AArch64::TBZX: 600 CC = AArch64CC::EQ; 601 break; 602 case AArch64::TBNZW: 603 case AArch64::TBNZX: 604 CC = AArch64CC::NE; 605 break; 606 } 607 // cmp reg, #foo is actually ands xzr, reg, #1<<foo. 
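// encodeLogicalImmediate converts the single-bit mask (1ull << bit) into the
// immr:imms form ANDS expects; e.g. testing bit 3 of a W register emits
// 'ands wzr, wN, #0x8'.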
608 if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW) 609 BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR) 610 .addReg(Cond[2].getReg()) 611 .addImm( 612 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32)); 613 else 614 BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR) 615 .addReg(Cond[2].getReg()) 616 .addImm( 617 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64)); 618 break; 619 } 620 } 621 622 unsigned Opc = 0; 623 const TargetRegisterClass *RC = nullptr; 624 bool TryFold = false; 625 if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) { 626 RC = &AArch64::GPR64RegClass; 627 Opc = AArch64::CSELXr; 628 TryFold = true; 629 } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) { 630 RC = &AArch64::GPR32RegClass; 631 Opc = AArch64::CSELWr; 632 TryFold = true; 633 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) { 634 RC = &AArch64::FPR64RegClass; 635 Opc = AArch64::FCSELDrrr; 636 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) { 637 RC = &AArch64::FPR32RegClass; 638 Opc = AArch64::FCSELSrrr; 639 } 640 assert(RC && "Unsupported regclass"); 641 642 // Try folding simple instructions into the csel. 643 if (TryFold) { 644 unsigned NewVReg = 0; 645 unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg); 646 if (FoldedOpc) { 647 // The folded opcodes csinc, csinv and csneg apply the operation to 648 // FalseReg, so we need to invert the condition. 649 CC = AArch64CC::getInvertedCondCode(CC); 650 TrueReg = FalseReg; 651 } else 652 FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg); 653 654 // Fold the operation. Leave any dead instructions for DCE to clean up. 655 if (FoldedOpc) { 656 FalseReg = NewVReg; 657 Opc = FoldedOpc; 658 // This extends the live range of NewVReg. 659 MRI.clearKillFlags(NewVReg); 660 } 661 } 662 663 // Pull all virtual registers into the appropriate class. 664 MRI.constrainRegClass(TrueReg, RC); 665 MRI.constrainRegClass(FalseReg, RC); 666 667 // Insert the csel. 668 BuildMI(MBB, I, DL, get(Opc), DstReg) 669 .addReg(TrueReg) 670 .addReg(FalseReg) 671 .addImm(CC); 672 } 673 674 /// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx. 675 static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) { 676 uint64_t Imm = MI.getOperand(1).getImm(); 677 uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); 678 uint64_t Encoding; 679 return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding); 680 } 681 682 // FIXME: this implementation should be micro-architecture dependent, so a 683 // micro-architecture target hook should be introduced here in the future. 684 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { 685 if (!Subtarget.hasCustomCheapAsMoveHandling()) 686 return MI.isAsCheapAsAMove(); 687 688 const unsigned Opcode = MI.getOpcode(); 689 690 // Firstly, check cases gated by features. 691 692 if (Subtarget.hasZeroCycleZeroingFP()) { 693 if (Opcode == AArch64::FMOVH0 || 694 Opcode == AArch64::FMOVS0 || 695 Opcode == AArch64::FMOVD0) 696 return true; 697 } 698 699 if (Subtarget.hasZeroCycleZeroingGP()) { 700 if (Opcode == TargetOpcode::COPY && 701 (MI.getOperand(1).getReg() == AArch64::WZR || 702 MI.getOperand(1).getReg() == AArch64::XZR)) 703 return true; 704 } 705 706 // Secondly, check cases specific to sub-targets.
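// (Exynos is the only subtarget given its own cheap-as-move table here.)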
707 708 if (Subtarget.hasExynosCheapAsMoveHandling()) { 709 if (isExynosCheapAsMove(MI)) 710 return true; 711 712 return MI.isAsCheapAsAMove(); 713 } 714 715 // Finally, check generic cases. 716 717 switch (Opcode) { 718 default: 719 return false; 720 721 // add/sub on register without shift 722 case AArch64::ADDWri: 723 case AArch64::ADDXri: 724 case AArch64::SUBWri: 725 case AArch64::SUBXri: 726 return (MI.getOperand(3).getImm() == 0); 727 728 // logical ops on immediate 729 case AArch64::ANDWri: 730 case AArch64::ANDXri: 731 case AArch64::EORWri: 732 case AArch64::EORXri: 733 case AArch64::ORRWri: 734 case AArch64::ORRXri: 735 return true; 736 737 // logical ops on register without shift 738 case AArch64::ANDWrr: 739 case AArch64::ANDXrr: 740 case AArch64::BICWrr: 741 case AArch64::BICXrr: 742 case AArch64::EONWrr: 743 case AArch64::EONXrr: 744 case AArch64::EORWrr: 745 case AArch64::EORXrr: 746 case AArch64::ORNWrr: 747 case AArch64::ORNXrr: 748 case AArch64::ORRWrr: 749 case AArch64::ORRXrr: 750 return true; 751 752 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or 753 // ORRXri, it is as cheap as MOV 754 case AArch64::MOVi32imm: 755 return canBeExpandedToORR(MI, 32); 756 case AArch64::MOVi64imm: 757 return canBeExpandedToORR(MI, 64); 758 } 759 760 llvm_unreachable("Unknown opcode to check as cheap as a move!"); 761 } 762 763 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) { 764 switch (MI.getOpcode()) { 765 default: 766 return false; 767 768 case AArch64::ADDWrs: 769 case AArch64::ADDXrs: 770 case AArch64::ADDSWrs: 771 case AArch64::ADDSXrs: { 772 unsigned Imm = MI.getOperand(3).getImm(); 773 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 774 if (ShiftVal == 0) 775 return true; 776 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5; 777 } 778 779 case AArch64::ADDWrx: 780 case AArch64::ADDXrx: 781 case AArch64::ADDXrx64: 782 case AArch64::ADDSWrx: 783 case AArch64::ADDSXrx: 784 case AArch64::ADDSXrx64: { 785 unsigned Imm = MI.getOperand(3).getImm(); 786 switch (AArch64_AM::getArithExtendType(Imm)) { 787 default: 788 return false; 789 case AArch64_AM::UXTB: 790 case AArch64_AM::UXTH: 791 case AArch64_AM::UXTW: 792 case AArch64_AM::UXTX: 793 return AArch64_AM::getArithShiftValue(Imm) <= 4; 794 } 795 } 796 797 case AArch64::SUBWrs: 798 case AArch64::SUBSWrs: { 799 unsigned Imm = MI.getOperand(3).getImm(); 800 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 801 return ShiftVal == 0 || 802 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31); 803 } 804 805 case AArch64::SUBXrs: 806 case AArch64::SUBSXrs: { 807 unsigned Imm = MI.getOperand(3).getImm(); 808 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 809 return ShiftVal == 0 || 810 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63); 811 } 812 813 case AArch64::SUBWrx: 814 case AArch64::SUBXrx: 815 case AArch64::SUBXrx64: 816 case AArch64::SUBSWrx: 817 case AArch64::SUBSXrx: 818 case AArch64::SUBSXrx64: { 819 unsigned Imm = MI.getOperand(3).getImm(); 820 switch (AArch64_AM::getArithExtendType(Imm)) { 821 default: 822 return false; 823 case AArch64_AM::UXTB: 824 case AArch64_AM::UXTH: 825 case AArch64_AM::UXTW: 826 case AArch64_AM::UXTX: 827 return AArch64_AM::getArithShiftValue(Imm) == 0; 828 } 829 } 830 831 case AArch64::LDRBBroW: 832 case AArch64::LDRBBroX: 833 case AArch64::LDRBroW: 834 case AArch64::LDRBroX: 835 case AArch64::LDRDroW: 836 case AArch64::LDRDroX: 837 case AArch64::LDRHHroW: 838 case AArch64::LDRHHroX: 839 case 
AArch64::LDRHroW: 840 case AArch64::LDRHroX: 841 case AArch64::LDRQroW: 842 case AArch64::LDRQroX: 843 case AArch64::LDRSBWroW: 844 case AArch64::LDRSBWroX: 845 case AArch64::LDRSBXroW: 846 case AArch64::LDRSBXroX: 847 case AArch64::LDRSHWroW: 848 case AArch64::LDRSHWroX: 849 case AArch64::LDRSHXroW: 850 case AArch64::LDRSHXroX: 851 case AArch64::LDRSWroW: 852 case AArch64::LDRSWroX: 853 case AArch64::LDRSroW: 854 case AArch64::LDRSroX: 855 case AArch64::LDRWroW: 856 case AArch64::LDRWroX: 857 case AArch64::LDRXroW: 858 case AArch64::LDRXroX: 859 case AArch64::PRFMroW: 860 case AArch64::PRFMroX: 861 case AArch64::STRBBroW: 862 case AArch64::STRBBroX: 863 case AArch64::STRBroW: 864 case AArch64::STRBroX: 865 case AArch64::STRDroW: 866 case AArch64::STRDroX: 867 case AArch64::STRHHroW: 868 case AArch64::STRHHroX: 869 case AArch64::STRHroW: 870 case AArch64::STRHroX: 871 case AArch64::STRQroW: 872 case AArch64::STRQroX: 873 case AArch64::STRSroW: 874 case AArch64::STRSroX: 875 case AArch64::STRWroW: 876 case AArch64::STRWroX: 877 case AArch64::STRXroW: 878 case AArch64::STRXroX: { 879 unsigned IsSigned = MI.getOperand(3).getImm(); 880 return !IsSigned; 881 } 882 } 883 } 884 885 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) { 886 unsigned Opc = MI.getOpcode(); 887 switch (Opc) { 888 default: 889 return false; 890 case AArch64::SEH_StackAlloc: 891 case AArch64::SEH_SaveFPLR: 892 case AArch64::SEH_SaveFPLR_X: 893 case AArch64::SEH_SaveReg: 894 case AArch64::SEH_SaveReg_X: 895 case AArch64::SEH_SaveRegP: 896 case AArch64::SEH_SaveRegP_X: 897 case AArch64::SEH_SaveFReg: 898 case AArch64::SEH_SaveFReg_X: 899 case AArch64::SEH_SaveFRegP: 900 case AArch64::SEH_SaveFRegP_X: 901 case AArch64::SEH_SetFP: 902 case AArch64::SEH_AddFP: 903 case AArch64::SEH_Nop: 904 case AArch64::SEH_PrologEnd: 905 case AArch64::SEH_EpilogStart: 906 case AArch64::SEH_EpilogEnd: 907 return true; 908 } 909 } 910 911 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, 912 unsigned &SrcReg, unsigned &DstReg, 913 unsigned &SubIdx) const { 914 switch (MI.getOpcode()) { 915 default: 916 return false; 917 case AArch64::SBFMXri: // aka sxtw 918 case AArch64::UBFMXri: // aka uxtw 919 // Check for the 32 -> 64 bit extension case, these instructions can do 920 // much more. 921 if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31) 922 return false; 923 // This is a signed or unsigned 32 -> 64 bit extension. 924 SrcReg = MI.getOperand(1).getReg(); 925 DstReg = MI.getOperand(0).getReg(); 926 SubIdx = AArch64::sub_32; 927 return true; 928 } 929 } 930 931 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint( 932 const MachineInstr &MIa, const MachineInstr &MIb) const { 933 const TargetRegisterInfo *TRI = &getRegisterInfo(); 934 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr; 935 int64_t OffsetA = 0, OffsetB = 0; 936 unsigned WidthA = 0, WidthB = 0; 937 938 assert(MIa.mayLoadOrStore() && "MIa must be a load or store."); 939 assert(MIb.mayLoadOrStore() && "MIb must be a load or store."); 940 941 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() || 942 MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) 943 return false; 944 945 // Retrieve the base, offset from the base and width. Width 946 // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If 947 // base are identical, and the offset of a lower memory access + 948 // the width doesn't overlap the offset of a higher memory access, 949 // then the memory accesses are different. 
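// Example: 'ldr x1, [x0, #8]' and 'ldr x2, [x0, #16]' share the base x0 and
// 8 + 8 <= 16, so the two accesses are trivially disjoint.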
950 if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) && 951 getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, WidthB, TRI)) { 952 if (BaseOpA->isIdenticalTo(*BaseOpB)) { 953 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; 954 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; 955 int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; 956 if (LowOffset + LowWidth <= HighOffset) 957 return true; 958 } 959 } 960 return false; 961 } 962 963 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI, 964 const MachineBasicBlock *MBB, 965 const MachineFunction &MF) const { 966 if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF)) 967 return true; 968 switch (MI.getOpcode()) { 969 case AArch64::HINT: 970 // CSDB hints are scheduling barriers. 971 if (MI.getOperand(0).getImm() == 0x14) 972 return true; 973 break; 974 case AArch64::DSB: 975 case AArch64::ISB: 976 // DSB and ISB also are scheduling barriers. 977 return true; 978 default:; 979 } 980 return isSEHInstruction(MI); 981 } 982 983 /// analyzeCompare - For a comparison instruction, return the source registers 984 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. 985 /// Return true if the comparison instruction can be analyzed. 986 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, 987 unsigned &SrcReg2, int &CmpMask, 988 int &CmpValue) const { 989 // The first operand can be a frame index where we'd normally expect a 990 // register. 991 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands"); 992 if (!MI.getOperand(1).isReg()) 993 return false; 994 995 switch (MI.getOpcode()) { 996 default: 997 break; 998 case AArch64::SUBSWrr: 999 case AArch64::SUBSWrs: 1000 case AArch64::SUBSWrx: 1001 case AArch64::SUBSXrr: 1002 case AArch64::SUBSXrs: 1003 case AArch64::SUBSXrx: 1004 case AArch64::ADDSWrr: 1005 case AArch64::ADDSWrs: 1006 case AArch64::ADDSWrx: 1007 case AArch64::ADDSXrr: 1008 case AArch64::ADDSXrs: 1009 case AArch64::ADDSXrx: 1010 // Replace SUBSWrr with SUBWrr if NZCV is not used. 1011 SrcReg = MI.getOperand(1).getReg(); 1012 SrcReg2 = MI.getOperand(2).getReg(); 1013 CmpMask = ~0; 1014 CmpValue = 0; 1015 return true; 1016 case AArch64::SUBSWri: 1017 case AArch64::ADDSWri: 1018 case AArch64::SUBSXri: 1019 case AArch64::ADDSXri: 1020 SrcReg = MI.getOperand(1).getReg(); 1021 SrcReg2 = 0; 1022 CmpMask = ~0; 1023 // FIXME: In order to convert CmpValue to 0 or 1 1024 CmpValue = MI.getOperand(2).getImm() != 0; 1025 return true; 1026 case AArch64::ANDSWri: 1027 case AArch64::ANDSXri: 1028 // ANDS does not use the same encoding scheme as the others xxxS 1029 // instructions. 1030 SrcReg = MI.getOperand(1).getReg(); 1031 SrcReg2 = 0; 1032 CmpMask = ~0; 1033 // FIXME:The return val type of decodeLogicalImmediate is uint64_t, 1034 // while the type of CmpValue is int. When converting uint64_t to int, 1035 // the high 32 bits of uint64_t will be lost. 1036 // In fact it causes a bug in spec2006-483.xalancbmk 1037 // CmpValue is only used to compare with zero in OptimizeCompareInstr 1038 CmpValue = AArch64_AM::decodeLogicalImmediate( 1039 MI.getOperand(2).getImm(), 1040 MI.getOpcode() == AArch64::ANDSWri ? 
32 : 64) != 0; 1041 return true; 1042 } 1043 1044 return false; 1045 } 1046 1047 static bool UpdateOperandRegClass(MachineInstr &Instr) { 1048 MachineBasicBlock *MBB = Instr.getParent(); 1049 assert(MBB && "Can't get MachineBasicBlock here"); 1050 MachineFunction *MF = MBB->getParent(); 1051 assert(MF && "Can't get MachineFunction here"); 1052 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); 1053 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); 1054 MachineRegisterInfo *MRI = &MF->getRegInfo(); 1055 1056 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx; 1057 ++OpIdx) { 1058 MachineOperand &MO = Instr.getOperand(OpIdx); 1059 const TargetRegisterClass *OpRegCstraints = 1060 Instr.getRegClassConstraint(OpIdx, TII, TRI); 1061 1062 // If there's no constraint, there's nothing to do. 1063 if (!OpRegCstraints) 1064 continue; 1065 // If the operand is a frame index, there's nothing to do here. 1066 // A frame index operand will resolve correctly during PEI. 1067 if (MO.isFI()) 1068 continue; 1069 1070 assert(MO.isReg() && 1071 "Operand has register constraints without being a register!"); 1072 1073 Register Reg = MO.getReg(); 1074 if (Register::isPhysicalRegister(Reg)) { 1075 if (!OpRegCstraints->contains(Reg)) 1076 return false; 1077 } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) && 1078 !MRI->constrainRegClass(Reg, OpRegCstraints)) 1079 return false; 1080 } 1081 1082 return true; 1083 } 1084 1085 /// Return the opcode that does not set flags when possible - otherwise 1086 /// return the original opcode. The caller is responsible to do the actual 1087 /// substitution and legality checking. 1088 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) { 1089 // Don't convert all compare instructions, because for some the zero register 1090 // encoding becomes the sp register. 1091 bool MIDefinesZeroReg = false; 1092 if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR)) 1093 MIDefinesZeroReg = true; 1094 1095 switch (MI.getOpcode()) { 1096 default: 1097 return MI.getOpcode(); 1098 case AArch64::ADDSWrr: 1099 return AArch64::ADDWrr; 1100 case AArch64::ADDSWri: 1101 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri; 1102 case AArch64::ADDSWrs: 1103 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs; 1104 case AArch64::ADDSWrx: 1105 return AArch64::ADDWrx; 1106 case AArch64::ADDSXrr: 1107 return AArch64::ADDXrr; 1108 case AArch64::ADDSXri: 1109 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri; 1110 case AArch64::ADDSXrs: 1111 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs; 1112 case AArch64::ADDSXrx: 1113 return AArch64::ADDXrx; 1114 case AArch64::SUBSWrr: 1115 return AArch64::SUBWrr; 1116 case AArch64::SUBSWri: 1117 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri; 1118 case AArch64::SUBSWrs: 1119 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs; 1120 case AArch64::SUBSWrx: 1121 return AArch64::SUBWrx; 1122 case AArch64::SUBSXrr: 1123 return AArch64::SUBXrr; 1124 case AArch64::SUBSXri: 1125 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri; 1126 case AArch64::SUBSXrs: 1127 return MIDefinesZeroReg ? 
AArch64::SUBSXrs : AArch64::SUBXrs; 1128 case AArch64::SUBSXrx: 1129 return AArch64::SUBXrx; 1130 } 1131 } 1132 1133 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 }; 1134 1135 /// True when condition flags are accessed (either by writing or reading) 1136 /// on the instruction trace starting at From and ending at To. 1137 /// 1138 /// Note: If From and To are from different blocks it's assumed CC are accessed 1139 /// on the path. 1140 static bool areCFlagsAccessedBetweenInstrs( 1141 MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, 1142 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) { 1143 // Early exit if To is at the beginning of the BB. 1144 if (To == To->getParent()->begin()) 1145 return true; 1146 1147 // Check whether the instructions are in the same basic block 1148 // If not, assume the condition flags might get modified somewhere. 1149 if (To->getParent() != From->getParent()) 1150 return true; 1151 1152 // From must be above To. 1153 assert(std::find_if(++To.getReverse(), To->getParent()->rend(), 1154 [From](MachineInstr &MI) { 1155 return MI.getIterator() == From; 1156 }) != To->getParent()->rend()); 1157 1158 // We iterate backward starting \p To until we hit \p From. 1159 for (--To; To != From; --To) { 1160 const MachineInstr &Instr = *To; 1161 1162 if (((AccessToCheck & AK_Write) && 1163 Instr.modifiesRegister(AArch64::NZCV, TRI)) || 1164 ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI))) 1165 return true; 1166 } 1167 return false; 1168 } 1169 1170 /// Try to optimize a compare instruction. A compare instruction is an 1171 /// instruction which produces AArch64::NZCV. It can be truly compare 1172 /// instruction 1173 /// when there are no uses of its destination register. 1174 /// 1175 /// The following steps are tried in order: 1176 /// 1. Convert CmpInstr into an unconditional version. 1177 /// 2. Remove CmpInstr if above there is an instruction producing a needed 1178 /// condition code or an instruction which can be converted into such an 1179 /// instruction. 1180 /// Only comparison with zero is supported. 1181 bool AArch64InstrInfo::optimizeCompareInstr( 1182 MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, 1183 int CmpValue, const MachineRegisterInfo *MRI) const { 1184 assert(CmpInstr.getParent()); 1185 assert(MRI); 1186 1187 // Replace SUBSWrr with SUBWrr if NZCV is not used. 1188 int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true); 1189 if (DeadNZCVIdx != -1) { 1190 if (CmpInstr.definesRegister(AArch64::WZR) || 1191 CmpInstr.definesRegister(AArch64::XZR)) { 1192 CmpInstr.eraseFromParent(); 1193 return true; 1194 } 1195 unsigned Opc = CmpInstr.getOpcode(); 1196 unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr); 1197 if (NewOpc == Opc) 1198 return false; 1199 const MCInstrDesc &MCID = get(NewOpc); 1200 CmpInstr.setDesc(MCID); 1201 CmpInstr.RemoveOperand(DeadNZCVIdx); 1202 bool succeeded = UpdateOperandRegClass(CmpInstr); 1203 (void)succeeded; 1204 assert(succeeded && "Some operands reg class are incompatible!"); 1205 return true; 1206 } 1207 1208 // Continue only if we have a "ri" where immediate is zero. 1209 // FIXME:CmpValue has already been converted to 0 or 1 in analyzeCompare 1210 // function. 1211 assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!"); 1212 if (CmpValue != 0 || SrcReg2 != 0) 1213 return false; 1214 1215 // CmpInstr is a Compare instruction if destination register is not used. 
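// i.e. the ADDS/SUBS exists only for its NZCV result, so it may be replaced by
// an earlier flag-setting definition of SrcReg.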
1216 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg())) 1217 return false; 1218 1219 return substituteCmpToZero(CmpInstr, SrcReg, MRI); 1220 } 1221 1222 /// Get opcode of S version of Instr. 1223 /// If Instr is S version its opcode is returned. 1224 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version 1225 /// or we are not interested in it. 1226 static unsigned sForm(MachineInstr &Instr) { 1227 switch (Instr.getOpcode()) { 1228 default: 1229 return AArch64::INSTRUCTION_LIST_END; 1230 1231 case AArch64::ADDSWrr: 1232 case AArch64::ADDSWri: 1233 case AArch64::ADDSXrr: 1234 case AArch64::ADDSXri: 1235 case AArch64::SUBSWrr: 1236 case AArch64::SUBSWri: 1237 case AArch64::SUBSXrr: 1238 case AArch64::SUBSXri: 1239 return Instr.getOpcode(); 1240 1241 case AArch64::ADDWrr: 1242 return AArch64::ADDSWrr; 1243 case AArch64::ADDWri: 1244 return AArch64::ADDSWri; 1245 case AArch64::ADDXrr: 1246 return AArch64::ADDSXrr; 1247 case AArch64::ADDXri: 1248 return AArch64::ADDSXri; 1249 case AArch64::ADCWr: 1250 return AArch64::ADCSWr; 1251 case AArch64::ADCXr: 1252 return AArch64::ADCSXr; 1253 case AArch64::SUBWrr: 1254 return AArch64::SUBSWrr; 1255 case AArch64::SUBWri: 1256 return AArch64::SUBSWri; 1257 case AArch64::SUBXrr: 1258 return AArch64::SUBSXrr; 1259 case AArch64::SUBXri: 1260 return AArch64::SUBSXri; 1261 case AArch64::SBCWr: 1262 return AArch64::SBCSWr; 1263 case AArch64::SBCXr: 1264 return AArch64::SBCSXr; 1265 case AArch64::ANDWri: 1266 return AArch64::ANDSWri; 1267 case AArch64::ANDXri: 1268 return AArch64::ANDSXri; 1269 } 1270 } 1271 1272 /// Check if AArch64::NZCV should be alive in successors of MBB. 1273 static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) { 1274 for (auto *BB : MBB->successors()) 1275 if (BB->isLiveIn(AArch64::NZCV)) 1276 return true; 1277 return false; 1278 } 1279 1280 namespace { 1281 1282 struct UsedNZCV { 1283 bool N = false; 1284 bool Z = false; 1285 bool C = false; 1286 bool V = false; 1287 1288 UsedNZCV() = default; 1289 1290 UsedNZCV &operator|=(const UsedNZCV &UsedFlags) { 1291 this->N |= UsedFlags.N; 1292 this->Z |= UsedFlags.Z; 1293 this->C |= UsedFlags.C; 1294 this->V |= UsedFlags.V; 1295 return *this; 1296 } 1297 }; 1298 1299 } // end anonymous namespace 1300 1301 /// Find a condition code used by the instruction. 1302 /// Returns AArch64CC::Invalid if either the instruction does not use condition 1303 /// codes or we don't optimize CmpInstr in the presence of such instructions. 
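/// Flag readers outside the handled Bcc/CSEL-family opcodes (e.g. CCMP)
/// conservatively yield AArch64CC::Invalid.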
1304 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) { 1305 switch (Instr.getOpcode()) { 1306 default: 1307 return AArch64CC::Invalid; 1308 1309 case AArch64::Bcc: { 1310 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); 1311 assert(Idx >= 2); 1312 return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm()); 1313 } 1314 1315 case AArch64::CSINVWr: 1316 case AArch64::CSINVXr: 1317 case AArch64::CSINCWr: 1318 case AArch64::CSINCXr: 1319 case AArch64::CSELWr: 1320 case AArch64::CSELXr: 1321 case AArch64::CSNEGWr: 1322 case AArch64::CSNEGXr: 1323 case AArch64::FCSELSrrr: 1324 case AArch64::FCSELDrrr: { 1325 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); 1326 assert(Idx >= 1); 1327 return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm()); 1328 } 1329 } 1330 } 1331 1332 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) { 1333 assert(CC != AArch64CC::Invalid); 1334 UsedNZCV UsedFlags; 1335 switch (CC) { 1336 default: 1337 break; 1338 1339 case AArch64CC::EQ: // Z set 1340 case AArch64CC::NE: // Z clear 1341 UsedFlags.Z = true; 1342 break; 1343 1344 case AArch64CC::HI: // Z clear and C set 1345 case AArch64CC::LS: // Z set or C clear 1346 UsedFlags.Z = true; 1347 LLVM_FALLTHROUGH; 1348 case AArch64CC::HS: // C set 1349 case AArch64CC::LO: // C clear 1350 UsedFlags.C = true; 1351 break; 1352 1353 case AArch64CC::MI: // N set 1354 case AArch64CC::PL: // N clear 1355 UsedFlags.N = true; 1356 break; 1357 1358 case AArch64CC::VS: // V set 1359 case AArch64CC::VC: // V clear 1360 UsedFlags.V = true; 1361 break; 1362 1363 case AArch64CC::GT: // Z clear, N and V the same 1364 case AArch64CC::LE: // Z set, N and V differ 1365 UsedFlags.Z = true; 1366 LLVM_FALLTHROUGH; 1367 case AArch64CC::GE: // N and V the same 1368 case AArch64CC::LT: // N and V differ 1369 UsedFlags.N = true; 1370 UsedFlags.V = true; 1371 break; 1372 } 1373 return UsedFlags; 1374 } 1375 1376 static bool isADDSRegImm(unsigned Opcode) { 1377 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri; 1378 } 1379 1380 static bool isSUBSRegImm(unsigned Opcode) { 1381 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri; 1382 } 1383 1384 /// Check if CmpInstr can be substituted by MI. 1385 /// 1386 /// CmpInstr can be substituted: 1387 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0' 1388 /// - and, MI and CmpInstr are from the same MachineBB 1389 /// - and, condition flags are not alive in successors of the CmpInstr parent 1390 /// - and, if MI opcode is the S form there must be no defs of flags between 1391 /// MI and CmpInstr 1392 /// or if MI opcode is not the S form there must be neither defs of flags 1393 /// nor uses of flags between MI and CmpInstr. 
1394 /// - and C/V flags are not used after CmpInstr 1395 static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr, 1396 const TargetRegisterInfo *TRI) { 1397 assert(MI); 1398 assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END); 1399 assert(CmpInstr); 1400 1401 const unsigned CmpOpcode = CmpInstr->getOpcode(); 1402 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode)) 1403 return false; 1404 1405 if (MI->getParent() != CmpInstr->getParent()) 1406 return false; 1407 1408 if (areCFlagsAliveInSuccessors(CmpInstr->getParent())) 1409 return false; 1410 1411 AccessKind AccessToCheck = AK_Write; 1412 if (sForm(*MI) != MI->getOpcode()) 1413 AccessToCheck = AK_All; 1414 if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck)) 1415 return false; 1416 1417 UsedNZCV NZCVUsedAfterCmp; 1418 for (auto I = std::next(CmpInstr->getIterator()), 1419 E = CmpInstr->getParent()->instr_end(); 1420 I != E; ++I) { 1421 const MachineInstr &Instr = *I; 1422 if (Instr.readsRegister(AArch64::NZCV, TRI)) { 1423 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr); 1424 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction 1425 return false; 1426 NZCVUsedAfterCmp |= getUsedNZCV(CC); 1427 } 1428 1429 if (Instr.modifiesRegister(AArch64::NZCV, TRI)) 1430 break; 1431 } 1432 1433 return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V; 1434 } 1435 1436 /// Substitute an instruction comparing to zero with another instruction 1437 /// which produces needed condition flags. 1438 /// 1439 /// Return true on success. 1440 bool AArch64InstrInfo::substituteCmpToZero( 1441 MachineInstr &CmpInstr, unsigned SrcReg, 1442 const MachineRegisterInfo *MRI) const { 1443 assert(MRI); 1444 // Get the unique definition of SrcReg. 1445 MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); 1446 if (!MI) 1447 return false; 1448 1449 const TargetRegisterInfo *TRI = &getRegisterInfo(); 1450 1451 unsigned NewOpc = sForm(*MI); 1452 if (NewOpc == AArch64::INSTRUCTION_LIST_END) 1453 return false; 1454 1455 if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI)) 1456 return false; 1457 1458 // Update the instruction to set NZCV. 1459 MI->setDesc(get(NewOpc)); 1460 CmpInstr.eraseFromParent(); 1461 bool succeeded = UpdateOperandRegClass(*MI); 1462 (void)succeeded; 1463 assert(succeeded && "Some operands reg class are incompatible!"); 1464 MI->addRegisterDefined(AArch64::NZCV, TRI); 1465 return true; 1466 } 1467 1468 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { 1469 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD && 1470 MI.getOpcode() != AArch64::CATCHRET) 1471 return false; 1472 1473 MachineBasicBlock &MBB = *MI.getParent(); 1474 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>(); 1475 auto TRI = Subtarget.getRegisterInfo(); 1476 DebugLoc DL = MI.getDebugLoc(); 1477 1478 if (MI.getOpcode() == AArch64::CATCHRET) { 1479 // Skip to the first instruction before the epilog. 
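// The expansion materialises the address of TargetMBB in x0 with an
// ADRP + ADDXri pair placed just before the SEH epilog markers.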
1480 const TargetInstrInfo *TII = 1481 MBB.getParent()->getSubtarget().getInstrInfo(); 1482 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB(); 1483 auto MBBI = MachineBasicBlock::iterator(MI); 1484 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI); 1485 while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) && 1486 FirstEpilogSEH != MBB.begin()) 1487 FirstEpilogSEH = std::prev(FirstEpilogSEH); 1488 if (FirstEpilogSEH != MBB.begin()) 1489 FirstEpilogSEH = std::next(FirstEpilogSEH); 1490 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP)) 1491 .addReg(AArch64::X0, RegState::Define) 1492 .addMBB(TargetMBB); 1493 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri)) 1494 .addReg(AArch64::X0, RegState::Define) 1495 .addReg(AArch64::X0) 1496 .addMBB(TargetMBB) 1497 .addImm(0); 1498 return true; 1499 } 1500 1501 Register Reg = MI.getOperand(0).getReg(); 1502 const GlobalValue *GV = 1503 cast<GlobalValue>((*MI.memoperands_begin())->getValue()); 1504 const TargetMachine &TM = MBB.getParent()->getTarget(); 1505 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); 1506 const unsigned char MO_NC = AArch64II::MO_NC; 1507 1508 if ((OpFlags & AArch64II::MO_GOT) != 0) { 1509 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg) 1510 .addGlobalAddress(GV, 0, OpFlags); 1511 if (Subtarget.isTargetILP32()) { 1512 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 1513 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 1514 .addDef(Reg32, RegState::Dead) 1515 .addUse(Reg, RegState::Kill) 1516 .addImm(0) 1517 .addMemOperand(*MI.memoperands_begin()) 1518 .addDef(Reg, RegState::Implicit); 1519 } else { 1520 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 1521 .addReg(Reg, RegState::Kill) 1522 .addImm(0) 1523 .addMemOperand(*MI.memoperands_begin()); 1524 } 1525 } else if (TM.getCodeModel() == CodeModel::Large) { 1526 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?"); 1527 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) 1528 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC) 1529 .addImm(0); 1530 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 1531 .addReg(Reg, RegState::Kill) 1532 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC) 1533 .addImm(16); 1534 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 1535 .addReg(Reg, RegState::Kill) 1536 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC) 1537 .addImm(32); 1538 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 1539 .addReg(Reg, RegState::Kill) 1540 .addGlobalAddress(GV, 0, AArch64II::MO_G3) 1541 .addImm(48); 1542 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 1543 .addReg(Reg, RegState::Kill) 1544 .addImm(0) 1545 .addMemOperand(*MI.memoperands_begin()); 1546 } else if (TM.getCodeModel() == CodeModel::Tiny) { 1547 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg) 1548 .addGlobalAddress(GV, 0, OpFlags); 1549 } else { 1550 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg) 1551 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); 1552 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC; 1553 if (Subtarget.isTargetILP32()) { 1554 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 1555 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 1556 .addDef(Reg32, RegState::Dead) 1557 .addUse(Reg, RegState::Kill) 1558 .addGlobalAddress(GV, 0, LoFlags) 1559 .addMemOperand(*MI.memoperands_begin()) 1560 .addDef(Reg, RegState::Implicit); 1561 } else { 1562 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 1563 .addReg(Reg, RegState::Kill) 1564 .addGlobalAddress(GV, 0, LoFlags) 1565 .addMemOperand(*MI.memoperands_begin()); 1566 } 
1567 } 1568 1569 MBB.erase(MI); 1570 1571 return true; 1572 } 1573 1574 // Return true if this instruction simply sets its single destination register 1575 // to zero. This is equivalent to a register rename of the zero-register. 1576 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) { 1577 switch (MI.getOpcode()) { 1578 default: 1579 break; 1580 case AArch64::MOVZWi: 1581 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0) 1582 if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) { 1583 assert(MI.getDesc().getNumOperands() == 3 && 1584 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands"); 1585 return true; 1586 } 1587 break; 1588 case AArch64::ANDWri: // and Rd, Rzr, #imm 1589 return MI.getOperand(1).getReg() == AArch64::WZR; 1590 case AArch64::ANDXri: 1591 return MI.getOperand(1).getReg() == AArch64::XZR; 1592 case TargetOpcode::COPY: 1593 return MI.getOperand(1).getReg() == AArch64::WZR; 1594 } 1595 return false; 1596 } 1597 1598 // Return true if this instruction simply renames a general register without 1599 // modifying bits. 1600 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) { 1601 switch (MI.getOpcode()) { 1602 default: 1603 break; 1604 case TargetOpcode::COPY: { 1605 // GPR32 copies will be lowered to ORRXrs 1606 Register DstReg = MI.getOperand(0).getReg(); 1607 return (AArch64::GPR32RegClass.contains(DstReg) || 1608 AArch64::GPR64RegClass.contains(DstReg)); 1609 } 1610 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0) 1611 if (MI.getOperand(1).getReg() == AArch64::XZR) { 1612 assert(MI.getDesc().getNumOperands() == 4 && 1613 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands"); 1614 return true; 1615 } 1616 break; 1617 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0) 1618 if (MI.getOperand(2).getImm() == 0) { 1619 assert(MI.getDesc().getNumOperands() == 4 && 1620 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands"); 1621 return true; 1622 } 1623 break; 1624 } 1625 return false; 1626 } 1627 1628 // Return true if this instruction simply renames a floating-point register 1629 // without modifying bits.
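// E.g. 'orr v0.16b, v1.16b, v1.16b' (ORRv16i8 with identical source operands)
// is a plain copy of q1 into q0.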
1630 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) { 1631 switch (MI.getOpcode()) { 1632 default: 1633 break; 1634 case TargetOpcode::COPY: { 1635 // FPR64 copies will be lowered to ORR.16b 1636 Register DstReg = MI.getOperand(0).getReg(); 1637 return (AArch64::FPR64RegClass.contains(DstReg) || 1638 AArch64::FPR128RegClass.contains(DstReg)); 1639 } 1640 case AArch64::ORRv16i8: 1641 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) { 1642 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() && 1643 "invalid ORRv16i8 operands"); 1644 return true; 1645 } 1646 break; 1647 } 1648 return false; 1649 } 1650 1651 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, 1652 int &FrameIndex) const { 1653 switch (MI.getOpcode()) { 1654 default: 1655 break; 1656 case AArch64::LDRWui: 1657 case AArch64::LDRXui: 1658 case AArch64::LDRBui: 1659 case AArch64::LDRHui: 1660 case AArch64::LDRSui: 1661 case AArch64::LDRDui: 1662 case AArch64::LDRQui: 1663 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && 1664 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { 1665 FrameIndex = MI.getOperand(1).getIndex(); 1666 return MI.getOperand(0).getReg(); 1667 } 1668 break; 1669 } 1670 1671 return 0; 1672 } 1673 1674 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI, 1675 int &FrameIndex) const { 1676 switch (MI.getOpcode()) { 1677 default: 1678 break; 1679 case AArch64::STRWui: 1680 case AArch64::STRXui: 1681 case AArch64::STRBui: 1682 case AArch64::STRHui: 1683 case AArch64::STRSui: 1684 case AArch64::STRDui: 1685 case AArch64::STRQui: 1686 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && 1687 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { 1688 FrameIndex = MI.getOperand(1).getIndex(); 1689 return MI.getOperand(0).getReg(); 1690 } 1691 break; 1692 } 1693 return 0; 1694 } 1695 1696 /// Check all MachineMemOperands for a hint to suppress pairing. 1697 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) { 1698 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 1699 return MMO->getFlags() & MOSuppressPair; 1700 }); 1701 } 1702 1703 /// Set a flag on the first MachineMemOperand to suppress pairing. 1704 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) { 1705 if (MI.memoperands_empty()) 1706 return; 1707 (*MI.memoperands_begin())->setFlags(MOSuppressPair); 1708 } 1709 1710 /// Check all MachineMemOperands for a hint that the load/store is strided.
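/// (The MOStridedAccess flag is expected to be attached by the Falkor
/// strided-access marking pass; this query merely reads it back.)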
1711 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) { 1712 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 1713 return MMO->getFlags() & MOStridedAccess; 1714 }); 1715 } 1716 1717 bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) { 1718 switch (Opc) { 1719 default: 1720 return false; 1721 case AArch64::STURSi: 1722 case AArch64::STURDi: 1723 case AArch64::STURQi: 1724 case AArch64::STURBBi: 1725 case AArch64::STURHHi: 1726 case AArch64::STURWi: 1727 case AArch64::STURXi: 1728 case AArch64::LDURSi: 1729 case AArch64::LDURDi: 1730 case AArch64::LDURQi: 1731 case AArch64::LDURWi: 1732 case AArch64::LDURXi: 1733 case AArch64::LDURSWi: 1734 case AArch64::LDURHHi: 1735 case AArch64::LDURBBi: 1736 case AArch64::LDURSBWi: 1737 case AArch64::LDURSHWi: 1738 return true; 1739 } 1740 } 1741 1742 Optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) { 1743 switch (Opc) { 1744 default: return {}; 1745 case AArch64::PRFMui: return AArch64::PRFUMi; 1746 case AArch64::LDRXui: return AArch64::LDURXi; 1747 case AArch64::LDRWui: return AArch64::LDURWi; 1748 case AArch64::LDRBui: return AArch64::LDURBi; 1749 case AArch64::LDRHui: return AArch64::LDURHi; 1750 case AArch64::LDRSui: return AArch64::LDURSi; 1751 case AArch64::LDRDui: return AArch64::LDURDi; 1752 case AArch64::LDRQui: return AArch64::LDURQi; 1753 case AArch64::LDRBBui: return AArch64::LDURBBi; 1754 case AArch64::LDRHHui: return AArch64::LDURHHi; 1755 case AArch64::LDRSBXui: return AArch64::LDURSBXi; 1756 case AArch64::LDRSBWui: return AArch64::LDURSBWi; 1757 case AArch64::LDRSHXui: return AArch64::LDURSHXi; 1758 case AArch64::LDRSHWui: return AArch64::LDURSHWi; 1759 case AArch64::LDRSWui: return AArch64::LDURSWi; 1760 case AArch64::STRXui: return AArch64::STURXi; 1761 case AArch64::STRWui: return AArch64::STURWi; 1762 case AArch64::STRBui: return AArch64::STURBi; 1763 case AArch64::STRHui: return AArch64::STURHi; 1764 case AArch64::STRSui: return AArch64::STURSi; 1765 case AArch64::STRDui: return AArch64::STURDi; 1766 case AArch64::STRQui: return AArch64::STURQi; 1767 case AArch64::STRBBui: return AArch64::STURBBi; 1768 case AArch64::STRHHui: return AArch64::STURHHi; 1769 } 1770 } 1771 1772 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { 1773 switch (Opc) { 1774 default: 1775 return 2; 1776 case AArch64::LDPXi: 1777 case AArch64::LDPDi: 1778 case AArch64::STPXi: 1779 case AArch64::STPDi: 1780 case AArch64::LDNPXi: 1781 case AArch64::LDNPDi: 1782 case AArch64::STNPXi: 1783 case AArch64::STNPDi: 1784 case AArch64::LDPQi: 1785 case AArch64::STPQi: 1786 case AArch64::LDNPQi: 1787 case AArch64::STNPQi: 1788 case AArch64::LDPWi: 1789 case AArch64::LDPSi: 1790 case AArch64::STPWi: 1791 case AArch64::STPSi: 1792 case AArch64::LDNPWi: 1793 case AArch64::LDNPSi: 1794 case AArch64::STNPWi: 1795 case AArch64::STNPSi: 1796 case AArch64::LDG: 1797 case AArch64::STGPi: 1798 return 3; 1799 case AArch64::ADDG: 1800 case AArch64::STGOffset: 1801 return 2; 1802 } 1803 } 1804 1805 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) { 1806 switch (MI.getOpcode()) { 1807 default: 1808 return false; 1809 // Scaled instructions. 1810 case AArch64::STRSui: 1811 case AArch64::STRDui: 1812 case AArch64::STRQui: 1813 case AArch64::STRXui: 1814 case AArch64::STRWui: 1815 case AArch64::LDRSui: 1816 case AArch64::LDRDui: 1817 case AArch64::LDRQui: 1818 case AArch64::LDRXui: 1819 case AArch64::LDRWui: 1820 case AArch64::LDRSWui: 1821 // Unscaled instructions. 
1822 case AArch64::STURSi: 1823 case AArch64::STURDi: 1824 case AArch64::STURQi: 1825 case AArch64::STURWi: 1826 case AArch64::STURXi: 1827 case AArch64::LDURSi: 1828 case AArch64::LDURDi: 1829 case AArch64::LDURQi: 1830 case AArch64::LDURWi: 1831 case AArch64::LDURXi: 1832 case AArch64::LDURSWi: 1833 return true; 1834 } 1835 } 1836 1837 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc, 1838 bool &Is64Bit) { 1839 switch (Opc) { 1840 default: 1841 llvm_unreachable("Opcode has no flag setting equivalent!"); 1842 // 32-bit cases: 1843 case AArch64::ADDWri: 1844 Is64Bit = false; 1845 return AArch64::ADDSWri; 1846 case AArch64::ADDWrr: 1847 Is64Bit = false; 1848 return AArch64::ADDSWrr; 1849 case AArch64::ADDWrs: 1850 Is64Bit = false; 1851 return AArch64::ADDSWrs; 1852 case AArch64::ADDWrx: 1853 Is64Bit = false; 1854 return AArch64::ADDSWrx; 1855 case AArch64::ANDWri: 1856 Is64Bit = false; 1857 return AArch64::ANDSWri; 1858 case AArch64::ANDWrr: 1859 Is64Bit = false; 1860 return AArch64::ANDSWrr; 1861 case AArch64::ANDWrs: 1862 Is64Bit = false; 1863 return AArch64::ANDSWrs; 1864 case AArch64::BICWrr: 1865 Is64Bit = false; 1866 return AArch64::BICSWrr; 1867 case AArch64::BICWrs: 1868 Is64Bit = false; 1869 return AArch64::BICSWrs; 1870 case AArch64::SUBWri: 1871 Is64Bit = false; 1872 return AArch64::SUBSWri; 1873 case AArch64::SUBWrr: 1874 Is64Bit = false; 1875 return AArch64::SUBSWrr; 1876 case AArch64::SUBWrs: 1877 Is64Bit = false; 1878 return AArch64::SUBSWrs; 1879 case AArch64::SUBWrx: 1880 Is64Bit = false; 1881 return AArch64::SUBSWrx; 1882 // 64-bit cases: 1883 case AArch64::ADDXri: 1884 Is64Bit = true; 1885 return AArch64::ADDSXri; 1886 case AArch64::ADDXrr: 1887 Is64Bit = true; 1888 return AArch64::ADDSXrr; 1889 case AArch64::ADDXrs: 1890 Is64Bit = true; 1891 return AArch64::ADDSXrs; 1892 case AArch64::ADDXrx: 1893 Is64Bit = true; 1894 return AArch64::ADDSXrx; 1895 case AArch64::ANDXri: 1896 Is64Bit = true; 1897 return AArch64::ANDSXri; 1898 case AArch64::ANDXrr: 1899 Is64Bit = true; 1900 return AArch64::ANDSXrr; 1901 case AArch64::ANDXrs: 1902 Is64Bit = true; 1903 return AArch64::ANDSXrs; 1904 case AArch64::BICXrr: 1905 Is64Bit = true; 1906 return AArch64::BICSXrr; 1907 case AArch64::BICXrs: 1908 Is64Bit = true; 1909 return AArch64::BICSXrs; 1910 case AArch64::SUBXri: 1911 Is64Bit = true; 1912 return AArch64::SUBSXri; 1913 case AArch64::SUBXrr: 1914 Is64Bit = true; 1915 return AArch64::SUBSXrr; 1916 case AArch64::SUBXrs: 1917 Is64Bit = true; 1918 return AArch64::SUBSXrs; 1919 case AArch64::SUBXrx: 1920 Is64Bit = true; 1921 return AArch64::SUBSXrx; 1922 } 1923 } 1924 1925 // Is this a candidate for ld/st merging or pairing? For example, we don't 1926 // touch volatiles or load/stores that have a hint to avoid pair formation. 1927 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const { 1928 // If this is a volatile load/store, don't mess with it. 1929 if (MI.hasOrderedMemoryRef()) 1930 return false; 1931 1932 // Make sure this is a reg/fi+imm (as opposed to an address reloc). 1933 assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) && 1934 "Expected a reg or frame index operand."); 1935 if (!MI.getOperand(2).isImm()) 1936 return false; 1937 1938 // Can't merge/pair if the instruction modifies the base register. 1939 // e.g., ldr x0, [x0] 1940 // This case will never occur with an FI base. 
1941 if (MI.getOperand(1).isReg()) { 1942 Register BaseReg = MI.getOperand(1).getReg(); 1943 const TargetRegisterInfo *TRI = &getRegisterInfo(); 1944 if (MI.modifiesRegister(BaseReg, TRI)) 1945 return false; 1946 } 1947 1948 // Check if this load/store has a hint to avoid pair formation. 1949 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. 1950 if (isLdStPairSuppressed(MI)) 1951 return false; 1952 1953 // Do not pair any callee-save store/reload instructions in the 1954 // prologue/epilogue if the CFI information encoded the operations as separate 1955 // instructions, as that will cause the size of the actual prologue to mismatch 1956 // with the prologue size recorded in the Windows CFI. 1957 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo(); 1958 bool NeedsWinCFI = MAI->usesWindowsCFI() && 1959 MI.getMF()->getFunction().needsUnwindTableEntry(); 1960 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) || 1961 MI.getFlag(MachineInstr::FrameDestroy))) 1962 return false; 1963 1964 // On some CPUs quad load/store pairs are slower than two single load/stores. 1965 if (Subtarget.isPaired128Slow()) { 1966 switch (MI.getOpcode()) { 1967 default: 1968 break; 1969 case AArch64::LDURQi: 1970 case AArch64::STURQi: 1971 case AArch64::LDRQui: 1972 case AArch64::STRQui: 1973 return false; 1974 } 1975 } 1976 1977 return true; 1978 } 1979 1980 bool AArch64InstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt, 1981 const MachineOperand *&BaseOp, 1982 int64_t &Offset, 1983 const TargetRegisterInfo *TRI) const { 1984 unsigned Width; 1985 return getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI); 1986 } 1987 1988 bool AArch64InstrInfo::getMemOperandWithOffsetWidth( 1989 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset, 1990 unsigned &Width, const TargetRegisterInfo *TRI) const { 1991 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 1992 // Handle only loads/stores with base register followed by immediate offset. 1993 if (LdSt.getNumExplicitOperands() == 3) { 1994 // Non-paired instruction (e.g., ldr x1, [x0, #8]). 1995 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) || 1996 !LdSt.getOperand(2).isImm()) 1997 return false; 1998 } else if (LdSt.getNumExplicitOperands() == 4) { 1999 // Paired instruction (e.g., ldp x1, x2, [x0, #8]). 2000 if (!LdSt.getOperand(1).isReg() || 2001 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) || 2002 !LdSt.getOperand(3).isImm()) 2003 return false; 2004 } else 2005 return false; 2006 2007 // Get the scaling factor for the instruction and set the width for the 2008 // instruction. 2009 unsigned Scale = 0; 2010 int64_t Dummy1, Dummy2; 2011 2012 // If this returns false, then it's an instruction we don't want to handle. 2013 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2)) 2014 return false; 2015 2016 // Compute the offset. Offset is calculated as the immediate operand 2017 // multiplied by the scaling factor. Unscaled instructions have scaling factor 2018 // set to 1. 
2019 if (LdSt.getNumExplicitOperands() == 3) { 2020 BaseOp = &LdSt.getOperand(1); 2021 Offset = LdSt.getOperand(2).getImm() * Scale; 2022 } else { 2023 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); 2024 BaseOp = &LdSt.getOperand(2); 2025 Offset = LdSt.getOperand(3).getImm() * Scale; 2026 } 2027 2028 assert((BaseOp->isReg() || BaseOp->isFI()) && 2029 "getMemOperandWithOffset only supports base " 2030 "operands of type register or frame index."); 2031 2032 return true; 2033 } 2034 2035 MachineOperand & 2036 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const { 2037 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 2038 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1); 2039 assert(OfsOp.isImm() && "Offset operand wasn't immediate."); 2040 return OfsOp; 2041 } 2042 2043 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, 2044 unsigned &Width, int64_t &MinOffset, 2045 int64_t &MaxOffset) { 2046 switch (Opcode) { 2047 // Not a memory operation or something we want to handle. 2048 default: 2049 Scale = Width = 0; 2050 MinOffset = MaxOffset = 0; 2051 return false; 2052 case AArch64::STRWpost: 2053 case AArch64::LDRWpost: 2054 Width = 32; 2055 Scale = 4; 2056 MinOffset = -256; 2057 MaxOffset = 255; 2058 break; 2059 case AArch64::LDURQi: 2060 case AArch64::STURQi: 2061 Width = 16; 2062 Scale = 1; 2063 MinOffset = -256; 2064 MaxOffset = 255; 2065 break; 2066 case AArch64::PRFUMi: 2067 case AArch64::LDURXi: 2068 case AArch64::LDURDi: 2069 case AArch64::STURXi: 2070 case AArch64::STURDi: 2071 Width = 8; 2072 Scale = 1; 2073 MinOffset = -256; 2074 MaxOffset = 255; 2075 break; 2076 case AArch64::LDURWi: 2077 case AArch64::LDURSi: 2078 case AArch64::LDURSWi: 2079 case AArch64::STURWi: 2080 case AArch64::STURSi: 2081 Width = 4; 2082 Scale = 1; 2083 MinOffset = -256; 2084 MaxOffset = 255; 2085 break; 2086 case AArch64::LDURHi: 2087 case AArch64::LDURHHi: 2088 case AArch64::LDURSHXi: 2089 case AArch64::LDURSHWi: 2090 case AArch64::STURHi: 2091 case AArch64::STURHHi: 2092 Width = 2; 2093 Scale = 1; 2094 MinOffset = -256; 2095 MaxOffset = 255; 2096 break; 2097 case AArch64::LDURBi: 2098 case AArch64::LDURBBi: 2099 case AArch64::LDURSBXi: 2100 case AArch64::LDURSBWi: 2101 case AArch64::STURBi: 2102 case AArch64::STURBBi: 2103 Width = 1; 2104 Scale = 1; 2105 MinOffset = -256; 2106 MaxOffset = 255; 2107 break; 2108 case AArch64::LDPQi: 2109 case AArch64::LDNPQi: 2110 case AArch64::STPQi: 2111 case AArch64::STNPQi: 2112 Scale = 16; 2113 Width = 32; 2114 MinOffset = -64; 2115 MaxOffset = 63; 2116 break; 2117 case AArch64::LDRQui: 2118 case AArch64::STRQui: 2119 Scale = Width = 16; 2120 MinOffset = 0; 2121 MaxOffset = 4095; 2122 break; 2123 case AArch64::LDPXi: 2124 case AArch64::LDPDi: 2125 case AArch64::LDNPXi: 2126 case AArch64::LDNPDi: 2127 case AArch64::STPXi: 2128 case AArch64::STPDi: 2129 case AArch64::STNPXi: 2130 case AArch64::STNPDi: 2131 Scale = 8; 2132 Width = 16; 2133 MinOffset = -64; 2134 MaxOffset = 63; 2135 break; 2136 case AArch64::PRFMui: 2137 case AArch64::LDRXui: 2138 case AArch64::LDRDui: 2139 case AArch64::STRXui: 2140 case AArch64::STRDui: 2141 Scale = Width = 8; 2142 MinOffset = 0; 2143 MaxOffset = 4095; 2144 break; 2145 case AArch64::LDPWi: 2146 case AArch64::LDPSi: 2147 case AArch64::LDNPWi: 2148 case AArch64::LDNPSi: 2149 case AArch64::STPWi: 2150 case AArch64::STPSi: 2151 case AArch64::STNPWi: 2152 case AArch64::STNPSi: 2153 Scale = 4; 2154 Width = 8; 2155 MinOffset = -64; 
2156 MaxOffset = 63; 2157 break; 2158 case AArch64::LDRWui: 2159 case AArch64::LDRSui: 2160 case AArch64::LDRSWui: 2161 case AArch64::STRWui: 2162 case AArch64::STRSui: 2163 Scale = Width = 4; 2164 MinOffset = 0; 2165 MaxOffset = 4095; 2166 break; 2167 case AArch64::LDRHui: 2168 case AArch64::LDRHHui: 2169 case AArch64::LDRSHWui: 2170 case AArch64::LDRSHXui: 2171 case AArch64::STRHui: 2172 case AArch64::STRHHui: 2173 Scale = Width = 2; 2174 MinOffset = 0; 2175 MaxOffset = 4095; 2176 break; 2177 case AArch64::LDRBui: 2178 case AArch64::LDRBBui: 2179 case AArch64::LDRSBWui: 2180 case AArch64::LDRSBXui: 2181 case AArch64::STRBui: 2182 case AArch64::STRBBui: 2183 Scale = Width = 1; 2184 MinOffset = 0; 2185 MaxOffset = 4095; 2186 break; 2187 case AArch64::ADDG: 2188 case AArch64::TAGPstack: 2189 Scale = 16; 2190 Width = 0; 2191 MinOffset = 0; 2192 MaxOffset = 63; 2193 break; 2194 case AArch64::LDG: 2195 case AArch64::STGOffset: 2196 case AArch64::STZGOffset: 2197 Scale = Width = 16; 2198 MinOffset = -256; 2199 MaxOffset = 255; 2200 break; 2201 case AArch64::LDR_PXI: 2202 case AArch64::STR_PXI: 2203 Scale = Width = 2; 2204 MinOffset = -256; 2205 MaxOffset = 255; 2206 break; 2207 case AArch64::LDR_ZXI: 2208 case AArch64::STR_ZXI: 2209 Scale = Width = 16; 2210 MinOffset = -256; 2211 MaxOffset = 255; 2212 break; 2213 case AArch64::ST2GOffset: 2214 case AArch64::STZ2GOffset: 2215 Scale = 16; 2216 Width = 32; 2217 MinOffset = -256; 2218 MaxOffset = 255; 2219 break; 2220 case AArch64::STGPi: 2221 Scale = Width = 16; 2222 MinOffset = -64; 2223 MaxOffset = 63; 2224 break; 2225 } 2226 2227 return true; 2228 } 2229 2230 static unsigned getOffsetStride(unsigned Opc) { 2231 switch (Opc) { 2232 default: 2233 return 0; 2234 case AArch64::LDURQi: 2235 case AArch64::STURQi: 2236 return 16; 2237 case AArch64::LDURXi: 2238 case AArch64::LDURDi: 2239 case AArch64::STURXi: 2240 case AArch64::STURDi: 2241 return 8; 2242 case AArch64::LDURWi: 2243 case AArch64::LDURSi: 2244 case AArch64::LDURSWi: 2245 case AArch64::STURWi: 2246 case AArch64::STURSi: 2247 return 4; 2248 } 2249 } 2250 2251 // Scale the unscaled offsets. Returns false if the unscaled offset can't be 2252 // scaled. 2253 static bool scaleOffset(unsigned Opc, int64_t &Offset) { 2254 unsigned OffsetStride = getOffsetStride(Opc); 2255 if (OffsetStride == 0) 2256 return false; 2257 // If the byte-offset isn't a multiple of the stride, we can't scale this 2258 // offset. 2259 if (Offset % OffsetStride != 0) 2260 return false; 2261 2262 // Convert the byte-offset used by unscaled into an "element" offset used 2263 // by the scaled pair load/store instructions. 2264 Offset /= OffsetStride; 2265 return true; 2266 } 2267 2268 // Unscale the scaled offsets. Returns false if the scaled offset can't be 2269 // unscaled. 2270 static bool unscaleOffset(unsigned Opc, int64_t &Offset) { 2271 unsigned OffsetStride = getOffsetStride(Opc); 2272 if (OffsetStride == 0) 2273 return false; 2274 2275 // Convert the "element" offset used by scaled pair load/store instructions 2276 // into the byte-offset used by unscaled. 2277 Offset *= OffsetStride; 2278 return true; 2279 } 2280 2281 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) { 2282 if (FirstOpc == SecondOpc) 2283 return true; 2284 // We can also pair sign-ext and zero-ext instructions. 
2285 switch (FirstOpc) { 2286 default: 2287 return false; 2288 case AArch64::LDRWui: 2289 case AArch64::LDURWi: 2290 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi; 2291 case AArch64::LDRSWui: 2292 case AArch64::LDURSWi: 2293 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi; 2294 } 2295 // These instructions can't be paired based on their opcodes. 2296 return false; 2297 } 2298 2299 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, 2300 int64_t Offset1, unsigned Opcode1, int FI2, 2301 int64_t Offset2, unsigned Opcode2) { 2302 // Accesses through fixed stack object frame indices may access a different 2303 // fixed stack slot. Check that the object offsets + offsets match. 2304 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) { 2305 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1); 2306 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2); 2307 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered."); 2308 // Get the byte-offset from the object offset. 2309 if (!unscaleOffset(Opcode1, Offset1) || !unscaleOffset(Opcode2, Offset2)) 2310 return false; 2311 ObjectOffset1 += Offset1; 2312 ObjectOffset2 += Offset2; 2313 // Get the "element" index in the object. 2314 if (!scaleOffset(Opcode1, ObjectOffset1) || 2315 !scaleOffset(Opcode2, ObjectOffset2)) 2316 return false; 2317 return ObjectOffset1 + 1 == ObjectOffset2; 2318 } 2319 2320 return FI1 == FI2; 2321 } 2322 2323 /// Detect opportunities for ldp/stp formation. 2324 /// 2325 /// Only called for LdSt for which getMemOperandWithOffset returns true. 2326 bool AArch64InstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1, 2327 const MachineOperand &BaseOp2, 2328 unsigned NumLoads) const { 2329 const MachineInstr &FirstLdSt = *BaseOp1.getParent(); 2330 const MachineInstr &SecondLdSt = *BaseOp2.getParent(); 2331 if (BaseOp1.getType() != BaseOp2.getType()) 2332 return false; 2333 2334 assert((BaseOp1.isReg() || BaseOp1.isFI()) && 2335 "Only base registers and frame indices are supported."); 2336 2337 // Check for both base regs and base FI. 2338 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg()) 2339 return false; 2340 2341 // Only cluster up to a single pair. 2342 if (NumLoads > 1) 2343 return false; 2344 2345 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt)) 2346 return false; 2347 2348 // Can we pair these instructions based on their opcodes? 2349 unsigned FirstOpc = FirstLdSt.getOpcode(); 2350 unsigned SecondOpc = SecondLdSt.getOpcode(); 2351 if (!canPairLdStOpc(FirstOpc, SecondOpc)) 2352 return false; 2353 2354 // Can't merge volatiles or load/stores that have a hint to avoid pair 2355 // formation, for example. 2356 if (!isCandidateToMergeOrPair(FirstLdSt) || 2357 !isCandidateToMergeOrPair(SecondLdSt)) 2358 return false; 2359 2360 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate. 2361 int64_t Offset1 = FirstLdSt.getOperand(2).getImm(); 2362 if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1)) 2363 return false; 2364 2365 int64_t Offset2 = SecondLdSt.getOperand(2).getImm(); 2366 if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2)) 2367 return false; 2368 2369 // Pairwise instructions have a 7-bit signed offset field. 2370 if (Offset1 > 63 || Offset1 < -64) 2371 return false; 2372 2373 // The caller should already have ordered First/SecondLdSt by offset. 
2374 // Note: except for non-equal frame index bases 2375 if (BaseOp1.isFI()) { 2376 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 >= Offset2) && 2377 "Caller should have ordered offsets."); 2378 2379 const MachineFrameInfo &MFI = 2380 FirstLdSt.getParent()->getParent()->getFrameInfo(); 2381 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc, 2382 BaseOp2.getIndex(), Offset2, SecondOpc); 2383 } 2384 2385 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) && 2386 "Caller should have ordered offsets."); 2387 2388 return Offset1 + 1 == Offset2; 2389 } 2390 2391 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, 2392 unsigned Reg, unsigned SubIdx, 2393 unsigned State, 2394 const TargetRegisterInfo *TRI) { 2395 if (!SubIdx) 2396 return MIB.addReg(Reg, State); 2397 2398 if (Register::isPhysicalRegister(Reg)) 2399 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); 2400 return MIB.addReg(Reg, State, SubIdx); 2401 } 2402 2403 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, 2404 unsigned NumRegs) { 2405 // We really want the positive remainder mod 32 here, that happens to be 2406 // easily obtainable with a mask. 2407 return ((DestReg - SrcReg) & 0x1f) < NumRegs; 2408 } 2409 2410 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB, 2411 MachineBasicBlock::iterator I, 2412 const DebugLoc &DL, unsigned DestReg, 2413 unsigned SrcReg, bool KillSrc, 2414 unsigned Opcode, 2415 ArrayRef<unsigned> Indices) const { 2416 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON"); 2417 const TargetRegisterInfo *TRI = &getRegisterInfo(); 2418 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 2419 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 2420 unsigned NumRegs = Indices.size(); 2421 2422 int SubReg = 0, End = NumRegs, Incr = 1; 2423 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) { 2424 SubReg = NumRegs - 1; 2425 End = -1; 2426 Incr = -1; 2427 } 2428 2429 for (; SubReg != End; SubReg += Incr) { 2430 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 2431 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 2432 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); 2433 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 2434 } 2435 } 2436 2437 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB, 2438 MachineBasicBlock::iterator I, 2439 DebugLoc DL, unsigned DestReg, 2440 unsigned SrcReg, bool KillSrc, 2441 unsigned Opcode, unsigned ZeroReg, 2442 llvm::ArrayRef<unsigned> Indices) const { 2443 const TargetRegisterInfo *TRI = &getRegisterInfo(); 2444 unsigned NumRegs = Indices.size(); 2445 2446 #ifndef NDEBUG 2447 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 2448 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 2449 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 && 2450 "GPR reg sequences should not be able to overlap"); 2451 #endif 2452 2453 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) { 2454 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 2455 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 2456 MIB.addReg(ZeroReg); 2457 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 2458 MIB.addImm(0); 2459 } 2460 } 2461 2462 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, 2463 MachineBasicBlock::iterator I, 2464 const DebugLoc &DL, unsigned DestReg, 2465 unsigned SrcReg, bool KillSrc) const { 2466 if 
(AArch64::GPR32spRegClass.contains(DestReg) && 2467 (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { 2468 const TargetRegisterInfo *TRI = &getRegisterInfo(); 2469 2470 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) { 2471 // If either operand is WSP, expand to ADD #0. 2472 if (Subtarget.hasZeroCycleRegMove()) { 2473 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. 2474 unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32, 2475 &AArch64::GPR64spRegClass); 2476 unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32, 2477 &AArch64::GPR64spRegClass); 2478 // This instruction is reading and writing X registers. This may upset 2479 // the register scavenger and machine verifier, so we need to indicate 2480 // that we are reading an undefined value from SrcRegX, but a proper 2481 // value from SrcReg. 2482 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX) 2483 .addReg(SrcRegX, RegState::Undef) 2484 .addImm(0) 2485 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) 2486 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 2487 } else { 2488 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg) 2489 .addReg(SrcReg, getKillRegState(KillSrc)) 2490 .addImm(0) 2491 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 2492 } 2493 } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) { 2494 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg) 2495 .addImm(0) 2496 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 2497 } else { 2498 if (Subtarget.hasZeroCycleRegMove()) { 2499 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. 2500 unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32, 2501 &AArch64::GPR64spRegClass); 2502 unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32, 2503 &AArch64::GPR64spRegClass); 2504 // This instruction is reading and writing X registers. This may upset 2505 // the register scavenger and machine verifier, so we need to indicate 2506 // that we are reading an undefined value from SrcRegX, but a proper 2507 // value from SrcReg. 2508 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX) 2509 .addReg(AArch64::XZR) 2510 .addReg(SrcRegX, RegState::Undef) 2511 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 2512 } else { 2513 // Otherwise, expand to ORR WZR. 2514 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg) 2515 .addReg(AArch64::WZR) 2516 .addReg(SrcReg, getKillRegState(KillSrc)); 2517 } 2518 } 2519 return; 2520 } 2521 2522 // Copy a Predicate register by ORRing with itself. 2523 if (AArch64::PPRRegClass.contains(DestReg) && 2524 AArch64::PPRRegClass.contains(SrcReg)) { 2525 assert(Subtarget.hasSVE() && "Unexpected SVE register."); 2526 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg) 2527 .addReg(SrcReg) // Pg 2528 .addReg(SrcReg) 2529 .addReg(SrcReg, getKillRegState(KillSrc)); 2530 return; 2531 } 2532 2533 // Copy a Z register by ORRing with itself. 
2534 if (AArch64::ZPRRegClass.contains(DestReg) && 2535 AArch64::ZPRRegClass.contains(SrcReg)) { 2536 assert(Subtarget.hasSVE() && "Unexpected SVE register."); 2537 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg) 2538 .addReg(SrcReg) 2539 .addReg(SrcReg, getKillRegState(KillSrc)); 2540 return; 2541 } 2542 2543 if (AArch64::GPR64spRegClass.contains(DestReg) && 2544 (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { 2545 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { 2546 // If either operand is SP, expand to ADD #0. 2547 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg) 2548 .addReg(SrcReg, getKillRegState(KillSrc)) 2549 .addImm(0) 2550 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 2551 } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) { 2552 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg) 2553 .addImm(0) 2554 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 2555 } else { 2556 // Otherwise, expand to ORR XZR. 2557 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg) 2558 .addReg(AArch64::XZR) 2559 .addReg(SrcReg, getKillRegState(KillSrc)); 2560 } 2561 return; 2562 } 2563 2564 // Copy a DDDD register quad by copying the individual sub-registers. 2565 if (AArch64::DDDDRegClass.contains(DestReg) && 2566 AArch64::DDDDRegClass.contains(SrcReg)) { 2567 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 2568 AArch64::dsub2, AArch64::dsub3}; 2569 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 2570 Indices); 2571 return; 2572 } 2573 2574 // Copy a DDD register triple by copying the individual sub-registers. 2575 if (AArch64::DDDRegClass.contains(DestReg) && 2576 AArch64::DDDRegClass.contains(SrcReg)) { 2577 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 2578 AArch64::dsub2}; 2579 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 2580 Indices); 2581 return; 2582 } 2583 2584 // Copy a DD register pair by copying the individual sub-registers. 2585 if (AArch64::DDRegClass.contains(DestReg) && 2586 AArch64::DDRegClass.contains(SrcReg)) { 2587 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1}; 2588 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 2589 Indices); 2590 return; 2591 } 2592 2593 // Copy a QQQQ register quad by copying the individual sub-registers. 2594 if (AArch64::QQQQRegClass.contains(DestReg) && 2595 AArch64::QQQQRegClass.contains(SrcReg)) { 2596 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 2597 AArch64::qsub2, AArch64::qsub3}; 2598 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 2599 Indices); 2600 return; 2601 } 2602 2603 // Copy a QQQ register triple by copying the individual sub-registers. 2604 if (AArch64::QQQRegClass.contains(DestReg) && 2605 AArch64::QQQRegClass.contains(SrcReg)) { 2606 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 2607 AArch64::qsub2}; 2608 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 2609 Indices); 2610 return; 2611 } 2612 2613 // Copy a QQ register pair by copying the individual sub-registers. 
2614   if (AArch64::QQRegClass.contains(DestReg) &&
2615       AArch64::QQRegClass.contains(SrcReg)) {
2616     static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
2617     copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
2618                      Indices);
2619     return;
2620   }
2621 
2622   if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
2623       AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
2624     static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
2625     copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
2626                     AArch64::XZR, Indices);
2627     return;
2628   }
2629 
2630   if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
2631       AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
2632     static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
2633     copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
2634                     AArch64::WZR, Indices);
2635     return;
2636   }
2637 
2638   if (AArch64::FPR128RegClass.contains(DestReg) &&
2639       AArch64::FPR128RegClass.contains(SrcReg)) {
2640     if (Subtarget.hasNEON()) {
2641       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2642           .addReg(SrcReg)
2643           .addReg(SrcReg, getKillRegState(KillSrc));
2644     } else {
2645       BuildMI(MBB, I, DL, get(AArch64::STRQpre))
2646           .addReg(AArch64::SP, RegState::Define)
2647           .addReg(SrcReg, getKillRegState(KillSrc))
2648           .addReg(AArch64::SP)
2649           .addImm(-16);
2650       BuildMI(MBB, I, DL, get(AArch64::LDRQpost))
2651           .addReg(AArch64::SP, RegState::Define)
2652           .addReg(DestReg, RegState::Define)
2653           .addReg(AArch64::SP)
2654           .addImm(16);
2655     }
2656     return;
2657   }
2658 
2659   if (AArch64::FPR64RegClass.contains(DestReg) &&
2660       AArch64::FPR64RegClass.contains(SrcReg)) {
2661     if (Subtarget.hasNEON()) {
2662       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
2663                                        &AArch64::FPR128RegClass);
2664       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
2665                                       &AArch64::FPR128RegClass);
2666       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2667           .addReg(SrcReg)
2668           .addReg(SrcReg, getKillRegState(KillSrc));
2669     } else {
2670       BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
2671           .addReg(SrcReg, getKillRegState(KillSrc));
2672     }
2673     return;
2674   }
2675 
2676   if (AArch64::FPR32RegClass.contains(DestReg) &&
2677       AArch64::FPR32RegClass.contains(SrcReg)) {
2678     if (Subtarget.hasNEON()) {
2679       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
2680                                        &AArch64::FPR128RegClass);
2681       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
2682                                       &AArch64::FPR128RegClass);
2683       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2684           .addReg(SrcReg)
2685           .addReg(SrcReg, getKillRegState(KillSrc));
2686     } else {
2687       BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2688           .addReg(SrcReg, getKillRegState(KillSrc));
2689     }
2690     return;
2691   }
2692 
2693   if (AArch64::FPR16RegClass.contains(DestReg) &&
2694       AArch64::FPR16RegClass.contains(SrcReg)) {
2695     if (Subtarget.hasNEON()) {
2696       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
2697                                        &AArch64::FPR128RegClass);
2698       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
2699                                       &AArch64::FPR128RegClass);
2700       BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
2701           .addReg(SrcReg)
2702           .addReg(SrcReg, getKillRegState(KillSrc));
2703     } else {
2704       DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
2705                                        &AArch64::FPR32RegClass);
2706       SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
2707                                       &AArch64::FPR32RegClass);
2708       BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
2709           .addReg(SrcReg, getKillRegState(KillSrc));
2710     }
2711
return; 2712 } 2713 2714 if (AArch64::FPR8RegClass.contains(DestReg) && 2715 AArch64::FPR8RegClass.contains(SrcReg)) { 2716 if (Subtarget.hasNEON()) { 2717 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub, 2718 &AArch64::FPR128RegClass); 2719 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, 2720 &AArch64::FPR128RegClass); 2721 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 2722 .addReg(SrcReg) 2723 .addReg(SrcReg, getKillRegState(KillSrc)); 2724 } else { 2725 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub, 2726 &AArch64::FPR32RegClass); 2727 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, 2728 &AArch64::FPR32RegClass); 2729 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 2730 .addReg(SrcReg, getKillRegState(KillSrc)); 2731 } 2732 return; 2733 } 2734 2735 // Copies between GPR64 and FPR64. 2736 if (AArch64::FPR64RegClass.contains(DestReg) && 2737 AArch64::GPR64RegClass.contains(SrcReg)) { 2738 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) 2739 .addReg(SrcReg, getKillRegState(KillSrc)); 2740 return; 2741 } 2742 if (AArch64::GPR64RegClass.contains(DestReg) && 2743 AArch64::FPR64RegClass.contains(SrcReg)) { 2744 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg) 2745 .addReg(SrcReg, getKillRegState(KillSrc)); 2746 return; 2747 } 2748 // Copies between GPR32 and FPR32. 2749 if (AArch64::FPR32RegClass.contains(DestReg) && 2750 AArch64::GPR32RegClass.contains(SrcReg)) { 2751 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) 2752 .addReg(SrcReg, getKillRegState(KillSrc)); 2753 return; 2754 } 2755 if (AArch64::GPR32RegClass.contains(DestReg) && 2756 AArch64::FPR32RegClass.contains(SrcReg)) { 2757 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg) 2758 .addReg(SrcReg, getKillRegState(KillSrc)); 2759 return; 2760 } 2761 2762 if (DestReg == AArch64::NZCV) { 2763 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy"); 2764 BuildMI(MBB, I, DL, get(AArch64::MSR)) 2765 .addImm(AArch64SysReg::NZCV) 2766 .addReg(SrcReg, getKillRegState(KillSrc)) 2767 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define); 2768 return; 2769 } 2770 2771 if (SrcReg == AArch64::NZCV) { 2772 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy"); 2773 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg) 2774 .addImm(AArch64SysReg::NZCV) 2775 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); 2776 return; 2777 } 2778 2779 llvm_unreachable("unimplemented reg-to-reg copy"); 2780 } 2781 2782 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, 2783 MachineBasicBlock &MBB, 2784 MachineBasicBlock::iterator InsertBefore, 2785 const MCInstrDesc &MCID, 2786 unsigned SrcReg, bool IsKill, 2787 unsigned SubIdx0, unsigned SubIdx1, int FI, 2788 MachineMemOperand *MMO) { 2789 unsigned SrcReg0 = SrcReg; 2790 unsigned SrcReg1 = SrcReg; 2791 if (Register::isPhysicalRegister(SrcReg)) { 2792 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0); 2793 SubIdx0 = 0; 2794 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1); 2795 SubIdx1 = 0; 2796 } 2797 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 2798 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0) 2799 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1) 2800 .addFrameIndex(FI) 2801 .addImm(0) 2802 .addMemOperand(MMO); 2803 } 2804 2805 void AArch64InstrInfo::storeRegToStackSlot( 2806 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, 2807 bool isKill, int FI, const TargetRegisterClass *RC, 2808 const TargetRegisterInfo *TRI) const { 2809 MachineFunction &MF = *MBB.getParent(); 2810 
MachineFrameInfo &MFI = MF.getFrameInfo(); 2811 unsigned Align = MFI.getObjectAlignment(FI); 2812 2813 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 2814 MachineMemOperand *MMO = MF.getMachineMemOperand( 2815 PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); 2816 unsigned Opc = 0; 2817 bool Offset = true; 2818 switch (TRI->getSpillSize(*RC)) { 2819 case 1: 2820 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 2821 Opc = AArch64::STRBui; 2822 break; 2823 case 2: 2824 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 2825 Opc = AArch64::STRHui; 2826 break; 2827 case 4: 2828 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 2829 Opc = AArch64::STRWui; 2830 if (Register::isVirtualRegister(SrcReg)) 2831 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass); 2832 else 2833 assert(SrcReg != AArch64::WSP); 2834 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 2835 Opc = AArch64::STRSui; 2836 break; 2837 case 8: 2838 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 2839 Opc = AArch64::STRXui; 2840 if (Register::isVirtualRegister(SrcReg)) 2841 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 2842 else 2843 assert(SrcReg != AArch64::SP); 2844 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 2845 Opc = AArch64::STRDui; 2846 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 2847 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 2848 get(AArch64::STPWi), SrcReg, isKill, 2849 AArch64::sube32, AArch64::subo32, FI, MMO); 2850 return; 2851 } 2852 break; 2853 case 16: 2854 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 2855 Opc = AArch64::STRQui; 2856 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 2857 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 2858 Opc = AArch64::ST1Twov1d; 2859 Offset = false; 2860 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 2861 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 2862 get(AArch64::STPXi), SrcReg, isKill, 2863 AArch64::sube64, AArch64::subo64, FI, MMO); 2864 return; 2865 } 2866 break; 2867 case 24: 2868 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 2869 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 2870 Opc = AArch64::ST1Threev1d; 2871 Offset = false; 2872 } 2873 break; 2874 case 32: 2875 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 2876 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 2877 Opc = AArch64::ST1Fourv1d; 2878 Offset = false; 2879 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 2880 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 2881 Opc = AArch64::ST1Twov2d; 2882 Offset = false; 2883 } 2884 break; 2885 case 48: 2886 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 2887 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 2888 Opc = AArch64::ST1Threev2d; 2889 Offset = false; 2890 } 2891 break; 2892 case 64: 2893 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 2894 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 2895 Opc = AArch64::ST1Fourv2d; 2896 Offset = false; 2897 } 2898 break; 2899 } 2900 assert(Opc && "Unknown register class"); 2901 2902 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 2903 .addReg(SrcReg, getKillRegState(isKill)) 2904 .addFrameIndex(FI); 2905 2906 if (Offset) 2907 MI.addImm(0); 2908 MI.addMemOperand(MMO); 2909 } 2910 2911 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, 2912 MachineBasicBlock &MBB, 2913 
MachineBasicBlock::iterator InsertBefore, 2914 const MCInstrDesc &MCID, 2915 unsigned DestReg, unsigned SubIdx0, 2916 unsigned SubIdx1, int FI, 2917 MachineMemOperand *MMO) { 2918 unsigned DestReg0 = DestReg; 2919 unsigned DestReg1 = DestReg; 2920 bool IsUndef = true; 2921 if (Register::isPhysicalRegister(DestReg)) { 2922 DestReg0 = TRI.getSubReg(DestReg, SubIdx0); 2923 SubIdx0 = 0; 2924 DestReg1 = TRI.getSubReg(DestReg, SubIdx1); 2925 SubIdx1 = 0; 2926 IsUndef = false; 2927 } 2928 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 2929 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0) 2930 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1) 2931 .addFrameIndex(FI) 2932 .addImm(0) 2933 .addMemOperand(MMO); 2934 } 2935 2936 void AArch64InstrInfo::loadRegFromStackSlot( 2937 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg, 2938 int FI, const TargetRegisterClass *RC, 2939 const TargetRegisterInfo *TRI) const { 2940 MachineFunction &MF = *MBB.getParent(); 2941 MachineFrameInfo &MFI = MF.getFrameInfo(); 2942 unsigned Align = MFI.getObjectAlignment(FI); 2943 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 2944 MachineMemOperand *MMO = MF.getMachineMemOperand( 2945 PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); 2946 2947 unsigned Opc = 0; 2948 bool Offset = true; 2949 switch (TRI->getSpillSize(*RC)) { 2950 case 1: 2951 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 2952 Opc = AArch64::LDRBui; 2953 break; 2954 case 2: 2955 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 2956 Opc = AArch64::LDRHui; 2957 break; 2958 case 4: 2959 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 2960 Opc = AArch64::LDRWui; 2961 if (Register::isVirtualRegister(DestReg)) 2962 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass); 2963 else 2964 assert(DestReg != AArch64::WSP); 2965 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 2966 Opc = AArch64::LDRSui; 2967 break; 2968 case 8: 2969 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 2970 Opc = AArch64::LDRXui; 2971 if (Register::isVirtualRegister(DestReg)) 2972 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass); 2973 else 2974 assert(DestReg != AArch64::SP); 2975 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 2976 Opc = AArch64::LDRDui; 2977 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 2978 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 2979 get(AArch64::LDPWi), DestReg, AArch64::sube32, 2980 AArch64::subo32, FI, MMO); 2981 return; 2982 } 2983 break; 2984 case 16: 2985 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 2986 Opc = AArch64::LDRQui; 2987 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 2988 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 2989 Opc = AArch64::LD1Twov1d; 2990 Offset = false; 2991 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 2992 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 2993 get(AArch64::LDPXi), DestReg, AArch64::sube64, 2994 AArch64::subo64, FI, MMO); 2995 return; 2996 } 2997 break; 2998 case 24: 2999 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 3000 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3001 Opc = AArch64::LD1Threev1d; 3002 Offset = false; 3003 } 3004 break; 3005 case 32: 3006 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 3007 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3008 Opc = AArch64::LD1Fourv1d; 3009 Offset = false; 3010 } else if 
(AArch64::QQRegClass.hasSubClassEq(RC)) { 3011 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3012 Opc = AArch64::LD1Twov2d; 3013 Offset = false; 3014 } 3015 break; 3016 case 48: 3017 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 3018 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3019 Opc = AArch64::LD1Threev2d; 3020 Offset = false; 3021 } 3022 break; 3023 case 64: 3024 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 3025 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3026 Opc = AArch64::LD1Fourv2d; 3027 Offset = false; 3028 } 3029 break; 3030 } 3031 assert(Opc && "Unknown register class"); 3032 3033 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 3034 .addReg(DestReg, getDefRegState(true)) 3035 .addFrameIndex(FI); 3036 if (Offset) 3037 MI.addImm(0); 3038 MI.addMemOperand(MMO); 3039 } 3040 3041 // Helper function to emit a frame offset adjustment from a given 3042 // pointer (SrcReg), stored into DestReg. This function is explicit 3043 // in that it requires the opcode. 3044 static void emitFrameOffsetAdj(MachineBasicBlock &MBB, 3045 MachineBasicBlock::iterator MBBI, 3046 const DebugLoc &DL, unsigned DestReg, 3047 unsigned SrcReg, int64_t Offset, unsigned Opc, 3048 const TargetInstrInfo *TII, 3049 MachineInstr::MIFlag Flag, bool NeedsWinCFI, 3050 bool *HasWinCFI) { 3051 int Sign = 1; 3052 unsigned MaxEncoding, ShiftSize; 3053 switch (Opc) { 3054 case AArch64::ADDXri: 3055 case AArch64::ADDSXri: 3056 case AArch64::SUBXri: 3057 case AArch64::SUBSXri: 3058 MaxEncoding = 0xfff; 3059 ShiftSize = 12; 3060 break; 3061 case AArch64::ADDVL_XXI: 3062 case AArch64::ADDPL_XXI: 3063 MaxEncoding = 31; 3064 ShiftSize = 0; 3065 if (Offset < 0) { 3066 MaxEncoding = 32; 3067 Sign = -1; 3068 Offset = -Offset; 3069 } 3070 break; 3071 default: 3072 llvm_unreachable("Unsupported opcode"); 3073 } 3074 3075 // FIXME: If the offset won't fit in 24-bits, compute the offset into a 3076 // scratch register. If DestReg is a virtual register, use it as the 3077 // scratch register; otherwise, create a new virtual register (to be 3078 // replaced by the scavenger at the end of PEI). That case can be optimized 3079 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch 3080 // register can be loaded with offset%8 and the add/sub can use an extending 3081 // instruction with LSL#3. 3082 // Currently the function handles any offsets but generates a poor sequence 3083 // of code. 
3084 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); 3085 3086 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; 3087 do { 3088 unsigned ThisVal = std::min<unsigned>(Offset, MaxEncodableValue); 3089 unsigned LocalShiftSize = 0; 3090 if (ThisVal > MaxEncoding) { 3091 ThisVal = ThisVal >> ShiftSize; 3092 LocalShiftSize = ShiftSize; 3093 } 3094 assert((ThisVal >> ShiftSize) <= MaxEncoding && 3095 "Encoding cannot handle value that big"); 3096 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) 3097 .addReg(SrcReg) 3098 .addImm(Sign * (int)ThisVal); 3099 if (ShiftSize) 3100 MBI = MBI.addImm( 3101 AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize)); 3102 MBI = MBI.setMIFlag(Flag); 3103 3104 if (NeedsWinCFI) { 3105 assert(Sign == 1 && "SEH directives should always have a positive sign"); 3106 int Imm = (int)(ThisVal << LocalShiftSize); 3107 if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) || 3108 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) { 3109 if (HasWinCFI) 3110 *HasWinCFI = true; 3111 if (Imm == 0) 3112 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag); 3113 else 3114 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)) 3115 .addImm(Imm) 3116 .setMIFlag(Flag); 3117 assert((Offset - Imm) == 0 && "Expected remaining offset to be zero to " 3118 "emit a single SEH directive"); 3119 } else if (DestReg == AArch64::SP) { 3120 if (HasWinCFI) 3121 *HasWinCFI = true; 3122 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc"); 3123 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) 3124 .addImm(Imm) 3125 .setMIFlag(Flag); 3126 } 3127 if (HasWinCFI) 3128 *HasWinCFI = true; 3129 } 3130 3131 SrcReg = DestReg; 3132 Offset -= ThisVal << LocalShiftSize; 3133 } while (Offset); 3134 } 3135 3136 void llvm::emitFrameOffset(MachineBasicBlock &MBB, 3137 MachineBasicBlock::iterator MBBI, const DebugLoc &DL, 3138 unsigned DestReg, unsigned SrcReg, 3139 StackOffset Offset, const TargetInstrInfo *TII, 3140 MachineInstr::MIFlag Flag, bool SetNZCV, 3141 bool NeedsWinCFI, bool *HasWinCFI) { 3142 int64_t Bytes, NumPredicateVectors, NumDataVectors; 3143 Offset.getForFrameOffset(Bytes, NumPredicateVectors, NumDataVectors); 3144 3145 // First emit non-scalable frame offsets, or a simple 'mov'. 3146 if (Bytes || (!Offset && SrcReg != DestReg)) { 3147 assert((DestReg != AArch64::SP || Bytes % 16 == 0) && 3148 "SP increment/decrement not 16-byte aligned"); 3149 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri; 3150 if (Bytes < 0) { 3151 Bytes = -Bytes; 3152 Opc = SetNZCV ? 
AArch64::SUBSXri : AArch64::SUBXri; 3153 } 3154 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag, 3155 NeedsWinCFI, HasWinCFI); 3156 SrcReg = DestReg; 3157 } 3158 3159 assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) && 3160 "SetNZCV not supported with SVE vectors"); 3161 assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) && 3162 "WinCFI not supported with SVE vectors"); 3163 3164 if (NumDataVectors) { 3165 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors, 3166 AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr); 3167 SrcReg = DestReg; 3168 } 3169 3170 if (NumPredicateVectors) { 3171 assert(DestReg != AArch64::SP && "Unaligned access to SP"); 3172 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors, 3173 AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr); 3174 } 3175 } 3176 3177 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( 3178 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, 3179 MachineBasicBlock::iterator InsertPt, int FrameIndex, 3180 LiveIntervals *LIS, VirtRegMap *VRM) const { 3181 // This is a bit of a hack. Consider this instruction: 3182 // 3183 // %0 = COPY %sp; GPR64all:%0 3184 // 3185 // We explicitly chose GPR64all for the virtual register so such a copy might 3186 // be eliminated by RegisterCoalescer. However, that may not be possible, and 3187 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all 3188 // register class, TargetInstrInfo::foldMemoryOperand() is going to try. 3189 // 3190 // To prevent that, we are going to constrain the %0 register class here. 3191 // 3192 // <rdar://problem/11522048> 3193 // 3194 if (MI.isFullCopy()) { 3195 Register DstReg = MI.getOperand(0).getReg(); 3196 Register SrcReg = MI.getOperand(1).getReg(); 3197 if (SrcReg == AArch64::SP && Register::isVirtualRegister(DstReg)) { 3198 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass); 3199 return nullptr; 3200 } 3201 if (DstReg == AArch64::SP && Register::isVirtualRegister(SrcReg)) { 3202 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 3203 return nullptr; 3204 } 3205 } 3206 3207 // Handle the case where a copy is being spilled or filled but the source 3208 // and destination register class don't match. For example: 3209 // 3210 // %0 = COPY %xzr; GPR64common:%0 3211 // 3212 // In this case we can still safely fold away the COPY and generate the 3213 // following spill code: 3214 // 3215 // STRXui %xzr, %stack.0 3216 // 3217 // This also eliminates spilled cross register class COPYs (e.g. between x and 3218 // d regs) of the same size. For example: 3219 // 3220 // %0 = COPY %1; GPR64:%0, FPR64:%1 3221 // 3222 // will be filled as 3223 // 3224 // LDRDui %0, fi<#0> 3225 // 3226 // instead of 3227 // 3228 // LDRXui %Temp, fi<#0> 3229 // %0 = FMOV %Temp 3230 // 3231 if (MI.isCopy() && Ops.size() == 1 && 3232 // Make sure we're only folding the explicit COPY defs/uses. 3233 (Ops[0] == 0 || Ops[0] == 1)) { 3234 bool IsSpill = Ops[0] == 0; 3235 bool IsFill = !IsSpill; 3236 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); 3237 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3238 MachineBasicBlock &MBB = *MI.getParent(); 3239 const MachineOperand &DstMO = MI.getOperand(0); 3240 const MachineOperand &SrcMO = MI.getOperand(1); 3241 Register DstReg = DstMO.getReg(); 3242 Register SrcReg = SrcMO.getReg(); 3243 // This is slightly expensive to compute for physical regs since 3244 // getMinimalPhysRegClass is slow. 
3245 auto getRegClass = [&](unsigned Reg) { 3246 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg) 3247 : TRI.getMinimalPhysRegClass(Reg); 3248 }; 3249 3250 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { 3251 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) == 3252 TRI.getRegSizeInBits(*getRegClass(SrcReg)) && 3253 "Mismatched register size in non subreg COPY"); 3254 if (IsSpill) 3255 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, 3256 getRegClass(SrcReg), &TRI); 3257 else 3258 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, 3259 getRegClass(DstReg), &TRI); 3260 return &*--InsertPt; 3261 } 3262 3263 // Handle cases like spilling def of: 3264 // 3265 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0 3266 // 3267 // where the physical register source can be widened and stored to the full 3268 // virtual reg destination stack slot, in this case producing: 3269 // 3270 // STRXui %xzr, %stack.0 3271 // 3272 if (IsSpill && DstMO.isUndef() && Register::isPhysicalRegister(SrcReg)) { 3273 assert(SrcMO.getSubReg() == 0 && 3274 "Unexpected subreg on physical register"); 3275 const TargetRegisterClass *SpillRC; 3276 unsigned SpillSubreg; 3277 switch (DstMO.getSubReg()) { 3278 default: 3279 SpillRC = nullptr; 3280 break; 3281 case AArch64::sub_32: 3282 case AArch64::ssub: 3283 if (AArch64::GPR32RegClass.contains(SrcReg)) { 3284 SpillRC = &AArch64::GPR64RegClass; 3285 SpillSubreg = AArch64::sub_32; 3286 } else if (AArch64::FPR32RegClass.contains(SrcReg)) { 3287 SpillRC = &AArch64::FPR64RegClass; 3288 SpillSubreg = AArch64::ssub; 3289 } else 3290 SpillRC = nullptr; 3291 break; 3292 case AArch64::dsub: 3293 if (AArch64::FPR64RegClass.contains(SrcReg)) { 3294 SpillRC = &AArch64::FPR128RegClass; 3295 SpillSubreg = AArch64::dsub; 3296 } else 3297 SpillRC = nullptr; 3298 break; 3299 } 3300 3301 if (SpillRC) 3302 if (unsigned WidenedSrcReg = 3303 TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) { 3304 storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(), 3305 FrameIndex, SpillRC, &TRI); 3306 return &*--InsertPt; 3307 } 3308 } 3309 3310 // Handle cases like filling use of: 3311 // 3312 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1 3313 // 3314 // where we can load the full virtual reg source stack slot, into the subreg 3315 // destination, in this case producing: 3316 // 3317 // LDRWui %0:sub_32<def,read-undef>, %stack.0 3318 // 3319 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) { 3320 const TargetRegisterClass *FillRC; 3321 switch (DstMO.getSubReg()) { 3322 default: 3323 FillRC = nullptr; 3324 break; 3325 case AArch64::sub_32: 3326 FillRC = &AArch64::GPR32RegClass; 3327 break; 3328 case AArch64::ssub: 3329 FillRC = &AArch64::FPR32RegClass; 3330 break; 3331 case AArch64::dsub: 3332 FillRC = &AArch64::FPR64RegClass; 3333 break; 3334 } 3335 3336 if (FillRC) { 3337 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) == 3338 TRI.getRegSizeInBits(*FillRC) && 3339 "Mismatched regclass size on folded subreg COPY"); 3340 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI); 3341 MachineInstr &LoadMI = *--InsertPt; 3342 MachineOperand &LoadDst = LoadMI.getOperand(0); 3343 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load"); 3344 LoadDst.setSubReg(DstMO.getSubReg()); 3345 LoadDst.setIsUndef(); 3346 return &LoadMI; 3347 } 3348 } 3349 } 3350 3351 // Cannot fold. 
3352 return nullptr; 3353 } 3354 3355 static bool isSVEScaledImmInstruction(unsigned Opcode) { 3356 switch (Opcode) { 3357 case AArch64::LDR_ZXI: 3358 case AArch64::STR_ZXI: 3359 case AArch64::LDR_PXI: 3360 case AArch64::STR_PXI: 3361 return true; 3362 default: 3363 return false; 3364 } 3365 } 3366 3367 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, 3368 StackOffset &SOffset, 3369 bool *OutUseUnscaledOp, 3370 unsigned *OutUnscaledOp, 3371 int64_t *EmittableOffset) { 3372 // Set output values in case of early exit. 3373 if (EmittableOffset) 3374 *EmittableOffset = 0; 3375 if (OutUseUnscaledOp) 3376 *OutUseUnscaledOp = false; 3377 if (OutUnscaledOp) 3378 *OutUnscaledOp = 0; 3379 3380 // Exit early for structured vector spills/fills as they can't take an 3381 // immediate offset. 3382 switch (MI.getOpcode()) { 3383 default: 3384 break; 3385 case AArch64::LD1Twov2d: 3386 case AArch64::LD1Threev2d: 3387 case AArch64::LD1Fourv2d: 3388 case AArch64::LD1Twov1d: 3389 case AArch64::LD1Threev1d: 3390 case AArch64::LD1Fourv1d: 3391 case AArch64::ST1Twov2d: 3392 case AArch64::ST1Threev2d: 3393 case AArch64::ST1Fourv2d: 3394 case AArch64::ST1Twov1d: 3395 case AArch64::ST1Threev1d: 3396 case AArch64::ST1Fourv1d: 3397 case AArch64::IRG: 3398 case AArch64::IRGstack: 3399 return AArch64FrameOffsetCannotUpdate; 3400 } 3401 3402 // Get the min/max offset and the scale. 3403 unsigned Scale, Width; 3404 int64_t MinOff, MaxOff; 3405 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), Scale, Width, MinOff, 3406 MaxOff)) 3407 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 3408 3409 // Construct the complete offset. 3410 bool IsMulVL = isSVEScaledImmInstruction(MI.getOpcode()); 3411 int64_t Offset = 3412 IsMulVL ? (SOffset.getScalableBytes()) : (SOffset.getBytes()); 3413 3414 const MachineOperand &ImmOpnd = 3415 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode())); 3416 Offset += ImmOpnd.getImm() * Scale; 3417 3418 // If the offset doesn't match the scale, we rewrite the instruction to 3419 // use the unscaled instruction instead. Likewise, if we have a negative 3420 // offset and there is an unscaled op to use. 3421 Optional<unsigned> UnscaledOp = 3422 AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode()); 3423 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0); 3424 if (useUnscaledOp && 3425 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, Scale, Width, MinOff, MaxOff)) 3426 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 3427 3428 int64_t Remainder = Offset % Scale; 3429 assert(!(Remainder && useUnscaledOp) && 3430 "Cannot have remainder when using unscaled op"); 3431 3432 assert(MinOff < MaxOff && "Unexpected Min/Max offsets"); 3433 int64_t NewOffset = Offset / Scale; 3434 if (MinOff <= NewOffset && NewOffset <= MaxOff) 3435 Offset = Remainder; 3436 else { 3437 NewOffset = NewOffset < 0 ? MinOff : MaxOff; 3438 Offset = Offset - NewOffset * Scale + Remainder; 3439 } 3440 3441 if (EmittableOffset) 3442 *EmittableOffset = NewOffset; 3443 if (OutUseUnscaledOp) 3444 *OutUseUnscaledOp = useUnscaledOp; 3445 if (OutUnscaledOp && UnscaledOp) 3446 *OutUnscaledOp = *UnscaledOp; 3447 3448 if (IsMulVL) 3449 SOffset = StackOffset(Offset, MVT::nxv1i8) + 3450 StackOffset(SOffset.getBytes(), MVT::i8); 3451 else 3452 SOffset = StackOffset(Offset, MVT::i8) + 3453 StackOffset(SOffset.getScalableBytes(), MVT::nxv1i8); 3454 return AArch64FrameOffsetCanUpdate | 3455 (SOffset ? 
0 : AArch64FrameOffsetIsLegal); 3456 } 3457 3458 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, 3459 unsigned FrameReg, StackOffset &Offset, 3460 const AArch64InstrInfo *TII) { 3461 unsigned Opcode = MI.getOpcode(); 3462 unsigned ImmIdx = FrameRegIdx + 1; 3463 3464 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) { 3465 Offset += StackOffset(MI.getOperand(ImmIdx).getImm(), MVT::i8); 3466 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), 3467 MI.getOperand(0).getReg(), FrameReg, Offset, TII, 3468 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri)); 3469 MI.eraseFromParent(); 3470 Offset = StackOffset(); 3471 return true; 3472 } 3473 3474 int64_t NewOffset; 3475 unsigned UnscaledOp; 3476 bool UseUnscaledOp; 3477 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, 3478 &UnscaledOp, &NewOffset); 3479 if (Status & AArch64FrameOffsetCanUpdate) { 3480 if (Status & AArch64FrameOffsetIsLegal) 3481 // Replace the FrameIndex with FrameReg. 3482 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); 3483 if (UseUnscaledOp) 3484 MI.setDesc(TII->get(UnscaledOp)); 3485 3486 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); 3487 return !Offset; 3488 } 3489 3490 return false; 3491 } 3492 3493 void AArch64InstrInfo::getNoop(MCInst &NopInst) const { 3494 NopInst.setOpcode(AArch64::HINT); 3495 NopInst.addOperand(MCOperand::createImm(0)); 3496 } 3497 3498 // AArch64 supports MachineCombiner. 3499 bool AArch64InstrInfo::useMachineCombiner() const { return true; } 3500 3501 // True when Opc sets flag 3502 static bool isCombineInstrSettingFlag(unsigned Opc) { 3503 switch (Opc) { 3504 case AArch64::ADDSWrr: 3505 case AArch64::ADDSWri: 3506 case AArch64::ADDSXrr: 3507 case AArch64::ADDSXri: 3508 case AArch64::SUBSWrr: 3509 case AArch64::SUBSXrr: 3510 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 3511 case AArch64::SUBSWri: 3512 case AArch64::SUBSXri: 3513 return true; 3514 default: 3515 break; 3516 } 3517 return false; 3518 } 3519 3520 // 32b Opcodes that can be combined with a MUL 3521 static bool isCombineInstrCandidate32(unsigned Opc) { 3522 switch (Opc) { 3523 case AArch64::ADDWrr: 3524 case AArch64::ADDWri: 3525 case AArch64::SUBWrr: 3526 case AArch64::ADDSWrr: 3527 case AArch64::ADDSWri: 3528 case AArch64::SUBSWrr: 3529 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 3530 case AArch64::SUBWri: 3531 case AArch64::SUBSWri: 3532 return true; 3533 default: 3534 break; 3535 } 3536 return false; 3537 } 3538 3539 // 64b Opcodes that can be combined with a MUL 3540 static bool isCombineInstrCandidate64(unsigned Opc) { 3541 switch (Opc) { 3542 case AArch64::ADDXrr: 3543 case AArch64::ADDXri: 3544 case AArch64::SUBXrr: 3545 case AArch64::ADDSXrr: 3546 case AArch64::ADDSXri: 3547 case AArch64::SUBSXrr: 3548 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 
  case AArch64::SUBXri:
  case AArch64::SUBSXri:
    return true;
  default:
    break;
  }
  return false;
}

// FP Opcodes that can be combined with a FMUL
static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
  switch (Inst.getOpcode()) {
  default:
    break;
  case AArch64::FADDHrr:
  case AArch64::FADDSrr:
  case AArch64::FADDDrr:
  case AArch64::FADDv4f16:
  case AArch64::FADDv8f16:
  case AArch64::FADDv2f32:
  case AArch64::FADDv2f64:
  case AArch64::FADDv4f32:
  case AArch64::FSUBHrr:
  case AArch64::FSUBSrr:
  case AArch64::FSUBDrr:
  case AArch64::FSUBv4f16:
  case AArch64::FSUBv8f16:
  case AArch64::FSUBv2f32:
  case AArch64::FSUBv2f64:
  case AArch64::FSUBv4f32:
    TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
    return (Options.UnsafeFPMath ||
            Options.AllowFPOpFusion == FPOpFusion::Fast);
  }
  return false;
}

// Opcodes that can be combined with a MUL
static bool isCombineInstrCandidate(unsigned Opc) {
  return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
}

//
// Utility routine that checks if \param MO is defined by an
// \param CombineOpc instruction in the basic block \param MBB
static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
                       unsigned CombineOpc, unsigned ZeroReg = 0,
                       bool CheckZeroReg = false) {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineInstr *MI = nullptr;

  if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
    MI = MRI.getUniqueVRegDef(MO.getReg());
  // And it needs to be in the trace (otherwise, it won't have a depth).
  if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
    return false;
  // It must only be used by the user we combine with.
  if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
    return false;

  if (CheckZeroReg) {
    assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
           MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
           MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
    // The third input reg must be zero.
    if (MI->getOperand(3).getReg() != ZeroReg)
      return false;
  }

  return true;
}

//
// Is \param MO defined by an integer multiply and can be combined?
static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
                              unsigned MulOpc, unsigned ZeroReg) {
  return canCombine(MBB, MO, MulOpc, ZeroReg, true);
}

//
// Is \param MO defined by a floating-point multiply and can be combined?
static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
                               unsigned MulOpc) {
  return canCombine(MBB, MO, MulOpc);
}

// TODO: There are many more machine instruction opcodes to match:
//       1. Other data types (integer, vectors)
//       2. Other math / logic operations (xor, or)
//       3.
Other forms of the same operation (intrinsics and other variants) 3639 bool AArch64InstrInfo::isAssociativeAndCommutative( 3640 const MachineInstr &Inst) const { 3641 switch (Inst.getOpcode()) { 3642 case AArch64::FADDDrr: 3643 case AArch64::FADDSrr: 3644 case AArch64::FADDv2f32: 3645 case AArch64::FADDv2f64: 3646 case AArch64::FADDv4f32: 3647 case AArch64::FMULDrr: 3648 case AArch64::FMULSrr: 3649 case AArch64::FMULX32: 3650 case AArch64::FMULX64: 3651 case AArch64::FMULXv2f32: 3652 case AArch64::FMULXv2f64: 3653 case AArch64::FMULXv4f32: 3654 case AArch64::FMULv2f32: 3655 case AArch64::FMULv2f64: 3656 case AArch64::FMULv4f32: 3657 return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; 3658 default: 3659 return false; 3660 } 3661 } 3662 3663 /// Find instructions that can be turned into madd. 3664 static bool getMaddPatterns(MachineInstr &Root, 3665 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 3666 unsigned Opc = Root.getOpcode(); 3667 MachineBasicBlock &MBB = *Root.getParent(); 3668 bool Found = false; 3669 3670 if (!isCombineInstrCandidate(Opc)) 3671 return false; 3672 if (isCombineInstrSettingFlag(Opc)) { 3673 int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true); 3674 // When NZCV is live bail out. 3675 if (Cmp_NZCV == -1) 3676 return false; 3677 unsigned NewOpc = convertToNonFlagSettingOpc(Root); 3678 // When opcode can't change bail out. 3679 // CHECKME: do we miss any cases for opcode conversion? 3680 if (NewOpc == Opc) 3681 return false; 3682 Opc = NewOpc; 3683 } 3684 3685 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg, 3686 MachineCombinerPattern Pattern) { 3687 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) { 3688 Patterns.push_back(Pattern); 3689 Found = true; 3690 } 3691 }; 3692 3693 typedef MachineCombinerPattern MCP; 3694 3695 switch (Opc) { 3696 default: 3697 break; 3698 case AArch64::ADDWrr: 3699 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 3700 "ADDWrr does not have register operands"); 3701 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1); 3702 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2); 3703 break; 3704 case AArch64::ADDXrr: 3705 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1); 3706 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2); 3707 break; 3708 case AArch64::SUBWrr: 3709 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1); 3710 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2); 3711 break; 3712 case AArch64::SUBXrr: 3713 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1); 3714 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2); 3715 break; 3716 case AArch64::ADDWri: 3717 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1); 3718 break; 3719 case AArch64::ADDXri: 3720 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1); 3721 break; 3722 case AArch64::SUBWri: 3723 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1); 3724 break; 3725 case AArch64::SUBXri: 3726 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1); 3727 break; 3728 } 3729 return Found; 3730 } 3731 /// Floating-Point Support 3732 3733 /// Find instructions that can be turned into madd. 
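/// For the floating-point case these are fused multiply-add/subtract
/// patterns, subject to the checks in isCombineInstrCandidateFP(). An
/// illustrative example (scalar single precision):
/// \code
///   fmul s1, s2, s3
///   fadd s0, s1, s4
///   ==> fmadd s0, s2, s3, s4
/// \endcode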
3734 static bool getFMAPatterns(MachineInstr &Root, 3735 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 3736 3737 if (!isCombineInstrCandidateFP(Root)) 3738 return false; 3739 3740 MachineBasicBlock &MBB = *Root.getParent(); 3741 bool Found = false; 3742 3743 auto Match = [&](int Opcode, int Operand, 3744 MachineCombinerPattern Pattern) -> bool { 3745 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) { 3746 Patterns.push_back(Pattern); 3747 return true; 3748 } 3749 return false; 3750 }; 3751 3752 typedef MachineCombinerPattern MCP; 3753 3754 switch (Root.getOpcode()) { 3755 default: 3756 assert(false && "Unsupported FP instruction in combiner\n"); 3757 break; 3758 case AArch64::FADDHrr: 3759 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 3760 "FADDHrr does not have register operands"); 3761 3762 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1); 3763 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2); 3764 break; 3765 case AArch64::FADDSrr: 3766 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 3767 "FADDSrr does not have register operands"); 3768 3769 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) || 3770 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1); 3771 3772 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) || 3773 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2); 3774 break; 3775 case AArch64::FADDDrr: 3776 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) || 3777 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1); 3778 3779 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) || 3780 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2); 3781 break; 3782 case AArch64::FADDv4f16: 3783 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) || 3784 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1); 3785 3786 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) || 3787 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2); 3788 break; 3789 case AArch64::FADDv8f16: 3790 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) || 3791 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1); 3792 3793 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) || 3794 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2); 3795 break; 3796 case AArch64::FADDv2f32: 3797 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) || 3798 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1); 3799 3800 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) || 3801 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2); 3802 break; 3803 case AArch64::FADDv2f64: 3804 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) || 3805 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1); 3806 3807 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) || 3808 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2); 3809 break; 3810 case AArch64::FADDv4f32: 3811 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) || 3812 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1); 3813 3814 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) || 3815 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2); 3816 break; 3817 case AArch64::FSUBHrr: 3818 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1); 3819 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2); 3820 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1); 3821 break; 
3822 case AArch64::FSUBSrr: 3823 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1); 3824 3825 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) || 3826 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2); 3827 3828 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1); 3829 break; 3830 case AArch64::FSUBDrr: 3831 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1); 3832 3833 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) || 3834 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2); 3835 3836 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1); 3837 break; 3838 case AArch64::FSUBv4f16: 3839 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) || 3840 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2); 3841 3842 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) || 3843 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1); 3844 break; 3845 case AArch64::FSUBv8f16: 3846 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) || 3847 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2); 3848 3849 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) || 3850 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1); 3851 break; 3852 case AArch64::FSUBv2f32: 3853 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) || 3854 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2); 3855 3856 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) || 3857 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1); 3858 break; 3859 case AArch64::FSUBv2f64: 3860 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) || 3861 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2); 3862 3863 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) || 3864 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1); 3865 break; 3866 case AArch64::FSUBv4f32: 3867 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) || 3868 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2); 3869 3870 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) || 3871 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1); 3872 break; 3873 } 3874 return Found; 3875 } 3876 3877 /// Return true when a code sequence can improve throughput. It 3878 /// should be called only for instructions in loops. 
3879 /// \param Pattern - combiner pattern 3880 bool AArch64InstrInfo::isThroughputPattern( 3881 MachineCombinerPattern Pattern) const { 3882 switch (Pattern) { 3883 default: 3884 break; 3885 case MachineCombinerPattern::FMULADDH_OP1: 3886 case MachineCombinerPattern::FMULADDH_OP2: 3887 case MachineCombinerPattern::FMULSUBH_OP1: 3888 case MachineCombinerPattern::FMULSUBH_OP2: 3889 case MachineCombinerPattern::FMULADDS_OP1: 3890 case MachineCombinerPattern::FMULADDS_OP2: 3891 case MachineCombinerPattern::FMULSUBS_OP1: 3892 case MachineCombinerPattern::FMULSUBS_OP2: 3893 case MachineCombinerPattern::FMULADDD_OP1: 3894 case MachineCombinerPattern::FMULADDD_OP2: 3895 case MachineCombinerPattern::FMULSUBD_OP1: 3896 case MachineCombinerPattern::FMULSUBD_OP2: 3897 case MachineCombinerPattern::FNMULSUBH_OP1: 3898 case MachineCombinerPattern::FNMULSUBS_OP1: 3899 case MachineCombinerPattern::FNMULSUBD_OP1: 3900 case MachineCombinerPattern::FMLAv4i16_indexed_OP1: 3901 case MachineCombinerPattern::FMLAv4i16_indexed_OP2: 3902 case MachineCombinerPattern::FMLAv8i16_indexed_OP1: 3903 case MachineCombinerPattern::FMLAv8i16_indexed_OP2: 3904 case MachineCombinerPattern::FMLAv1i32_indexed_OP1: 3905 case MachineCombinerPattern::FMLAv1i32_indexed_OP2: 3906 case MachineCombinerPattern::FMLAv1i64_indexed_OP1: 3907 case MachineCombinerPattern::FMLAv1i64_indexed_OP2: 3908 case MachineCombinerPattern::FMLAv4f16_OP2: 3909 case MachineCombinerPattern::FMLAv4f16_OP1: 3910 case MachineCombinerPattern::FMLAv8f16_OP1: 3911 case MachineCombinerPattern::FMLAv8f16_OP2: 3912 case MachineCombinerPattern::FMLAv2f32_OP2: 3913 case MachineCombinerPattern::FMLAv2f32_OP1: 3914 case MachineCombinerPattern::FMLAv2f64_OP1: 3915 case MachineCombinerPattern::FMLAv2f64_OP2: 3916 case MachineCombinerPattern::FMLAv2i32_indexed_OP1: 3917 case MachineCombinerPattern::FMLAv2i32_indexed_OP2: 3918 case MachineCombinerPattern::FMLAv2i64_indexed_OP1: 3919 case MachineCombinerPattern::FMLAv2i64_indexed_OP2: 3920 case MachineCombinerPattern::FMLAv4f32_OP1: 3921 case MachineCombinerPattern::FMLAv4f32_OP2: 3922 case MachineCombinerPattern::FMLAv4i32_indexed_OP1: 3923 case MachineCombinerPattern::FMLAv4i32_indexed_OP2: 3924 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: 3925 case MachineCombinerPattern::FMLSv4i16_indexed_OP2: 3926 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: 3927 case MachineCombinerPattern::FMLSv8i16_indexed_OP2: 3928 case MachineCombinerPattern::FMLSv1i32_indexed_OP2: 3929 case MachineCombinerPattern::FMLSv1i64_indexed_OP2: 3930 case MachineCombinerPattern::FMLSv2i32_indexed_OP2: 3931 case MachineCombinerPattern::FMLSv2i64_indexed_OP2: 3932 case MachineCombinerPattern::FMLSv4f16_OP1: 3933 case MachineCombinerPattern::FMLSv4f16_OP2: 3934 case MachineCombinerPattern::FMLSv8f16_OP1: 3935 case MachineCombinerPattern::FMLSv8f16_OP2: 3936 case MachineCombinerPattern::FMLSv2f32_OP2: 3937 case MachineCombinerPattern::FMLSv2f64_OP2: 3938 case MachineCombinerPattern::FMLSv4i32_indexed_OP2: 3939 case MachineCombinerPattern::FMLSv4f32_OP2: 3940 return true; 3941 } // end switch (Pattern) 3942 return false; 3943 } 3944 /// Return true when there is potentially a faster code sequence for an 3945 /// instruction chain ending in \p Root. All potential patterns are listed in 3946 /// the \p Pattern vector. Pattern should be sorted in priority order since the 3947 /// pattern evaluator stops checking as soon as it finds a faster sequence. 
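/// For example (illustrative), an integer multiply feeding an add is matched
/// here and later rewritten by genAlternativeCodeSequence() into a single
/// multiply-add:
/// \code
///   mul  w8, w0, w1
///   add  w0, w8, w2
///   ==> madd w0, w0, w1, w2
/// \endcode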
bool AArch64InstrInfo::getMachineCombinerPatterns(
    MachineInstr &Root,
    SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
  // Integer patterns
  if (getMaddPatterns(Root, Patterns))
    return true;
  // Floating point patterns
  if (getFMAPatterns(Root, Patterns))
    return true;

  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
}

enum class FMAInstKind { Default, Indexed, Accumulator };
/// genFusedMultiply - Generate fused multiply instructions.
/// This function supports both integer and floating point instructions.
/// A typical example:
///  F|MUL I=A,B,0
///  F|ADD R,I,C
///  ==> F|MADD R,A,B,C
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the F|ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the F|MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the f|madd instruction
/// \param RC Register class of operands
/// \param kind The kind of FMA instruction (addressing mode) to be generated
/// \param ReplacedAddend is the result register from the instruction
/// replacing the non-combined operand, if any.
static MachineInstr *
genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
                 const TargetInstrInfo *TII, MachineInstr &Root,
                 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
                 unsigned MaddOpc, const TargetRegisterClass *RC,
                 FMAInstKind kind = FMAInstKind::Default,
                 const Register *ReplacedAddend = nullptr) {
  assert(IdxMulOpd == 1 || IdxMulOpd == 2);

  unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
  Register ResultReg = Root.getOperand(0).getReg();
  Register SrcReg0 = MUL->getOperand(1).getReg();
  bool Src0IsKill = MUL->getOperand(1).isKill();
  Register SrcReg1 = MUL->getOperand(2).getReg();
  bool Src1IsKill = MUL->getOperand(2).isKill();

  unsigned SrcReg2;
  bool Src2IsKill;
  if (ReplacedAddend) {
    // If we just generated a new addend, we must be its only use.
    SrcReg2 = *ReplacedAddend;
    Src2IsKill = true;
  } else {
    SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
    Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
  }

  if (Register::isVirtualRegister(ResultReg))
    MRI.constrainRegClass(ResultReg, RC);
  if (Register::isVirtualRegister(SrcReg0))
    MRI.constrainRegClass(SrcReg0, RC);
  if (Register::isVirtualRegister(SrcReg1))
    MRI.constrainRegClass(SrcReg1, RC);
  if (Register::isVirtualRegister(SrcReg2))
    MRI.constrainRegClass(SrcReg2, RC);

  MachineInstrBuilder MIB;
  if (kind == FMAInstKind::Default)
    MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill))
              .addReg(SrcReg2, getKillRegState(Src2IsKill));
  else if (kind == FMAInstKind::Indexed)
    MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg2, getKillRegState(Src2IsKill))
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill))
              .addImm(MUL->getOperand(3).getImm());
  else if (kind == FMAInstKind::Accumulator)
    MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg2, getKillRegState(Src2IsKill))
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill));
  else
    assert(false && "Invalid FMA instruction kind \n");
  // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
  InsInstrs.push_back(MIB);
  return MUL;
}

/// genMaddR - Generate madd instruction and combine mul and add using
/// an extra virtual register
/// Example - an ADD intermediate needs to be stored in a register:
///   MUL I=A,B,0
///   ADD R,I,Imm
///   ==> ORR V, ZR, Imm
///   ==> MADD R,A,B,V
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the madd instruction
/// \param VR is a virtual register that holds the value of an ADD operand
/// (V in the example above).
4061 /// \param RC Register class of operands 4062 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, 4063 const TargetInstrInfo *TII, MachineInstr &Root, 4064 SmallVectorImpl<MachineInstr *> &InsInstrs, 4065 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, 4066 const TargetRegisterClass *RC) { 4067 assert(IdxMulOpd == 1 || IdxMulOpd == 2); 4068 4069 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); 4070 Register ResultReg = Root.getOperand(0).getReg(); 4071 Register SrcReg0 = MUL->getOperand(1).getReg(); 4072 bool Src0IsKill = MUL->getOperand(1).isKill(); 4073 Register SrcReg1 = MUL->getOperand(2).getReg(); 4074 bool Src1IsKill = MUL->getOperand(2).isKill(); 4075 4076 if (Register::isVirtualRegister(ResultReg)) 4077 MRI.constrainRegClass(ResultReg, RC); 4078 if (Register::isVirtualRegister(SrcReg0)) 4079 MRI.constrainRegClass(SrcReg0, RC); 4080 if (Register::isVirtualRegister(SrcReg1)) 4081 MRI.constrainRegClass(SrcReg1, RC); 4082 if (Register::isVirtualRegister(VR)) 4083 MRI.constrainRegClass(VR, RC); 4084 4085 MachineInstrBuilder MIB = 4086 BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) 4087 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 4088 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 4089 .addReg(VR); 4090 // Insert the MADD 4091 InsInstrs.push_back(MIB); 4092 return MUL; 4093 } 4094 4095 /// When getMachineCombinerPatterns() finds potential patterns, 4096 /// this function generates the instructions that could replace the 4097 /// original code sequence 4098 void AArch64InstrInfo::genAlternativeCodeSequence( 4099 MachineInstr &Root, MachineCombinerPattern Pattern, 4100 SmallVectorImpl<MachineInstr *> &InsInstrs, 4101 SmallVectorImpl<MachineInstr *> &DelInstrs, 4102 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const { 4103 MachineBasicBlock &MBB = *Root.getParent(); 4104 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 4105 MachineFunction &MF = *MBB.getParent(); 4106 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); 4107 4108 MachineInstr *MUL; 4109 const TargetRegisterClass *RC; 4110 unsigned Opc; 4111 switch (Pattern) { 4112 default: 4113 // Reassociate instructions. 
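    // Patterns not handled explicitly below (e.g. the generic reassociation
    // patterns) are forwarded to the target-independent implementation.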
4114 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, 4115 DelInstrs, InstrIdxForVirtReg); 4116 return; 4117 case MachineCombinerPattern::MULADDW_OP1: 4118 case MachineCombinerPattern::MULADDX_OP1: 4119 // MUL I=A,B,0 4120 // ADD R,I,C 4121 // ==> MADD R,A,B,C 4122 // --- Create(MADD); 4123 if (Pattern == MachineCombinerPattern::MULADDW_OP1) { 4124 Opc = AArch64::MADDWrrr; 4125 RC = &AArch64::GPR32RegClass; 4126 } else { 4127 Opc = AArch64::MADDXrrr; 4128 RC = &AArch64::GPR64RegClass; 4129 } 4130 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4131 break; 4132 case MachineCombinerPattern::MULADDW_OP2: 4133 case MachineCombinerPattern::MULADDX_OP2: 4134 // MUL I=A,B,0 4135 // ADD R,C,I 4136 // ==> MADD R,A,B,C 4137 // --- Create(MADD); 4138 if (Pattern == MachineCombinerPattern::MULADDW_OP2) { 4139 Opc = AArch64::MADDWrrr; 4140 RC = &AArch64::GPR32RegClass; 4141 } else { 4142 Opc = AArch64::MADDXrrr; 4143 RC = &AArch64::GPR64RegClass; 4144 } 4145 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4146 break; 4147 case MachineCombinerPattern::MULADDWI_OP1: 4148 case MachineCombinerPattern::MULADDXI_OP1: { 4149 // MUL I=A,B,0 4150 // ADD R,I,Imm 4151 // ==> ORR V, ZR, Imm 4152 // ==> MADD R,A,B,V 4153 // --- Create(MADD); 4154 const TargetRegisterClass *OrrRC; 4155 unsigned BitSize, OrrOpc, ZeroReg; 4156 if (Pattern == MachineCombinerPattern::MULADDWI_OP1) { 4157 OrrOpc = AArch64::ORRWri; 4158 OrrRC = &AArch64::GPR32spRegClass; 4159 BitSize = 32; 4160 ZeroReg = AArch64::WZR; 4161 Opc = AArch64::MADDWrrr; 4162 RC = &AArch64::GPR32RegClass; 4163 } else { 4164 OrrOpc = AArch64::ORRXri; 4165 OrrRC = &AArch64::GPR64spRegClass; 4166 BitSize = 64; 4167 ZeroReg = AArch64::XZR; 4168 Opc = AArch64::MADDXrrr; 4169 RC = &AArch64::GPR64RegClass; 4170 } 4171 Register NewVR = MRI.createVirtualRegister(OrrRC); 4172 uint64_t Imm = Root.getOperand(2).getImm(); 4173 4174 if (Root.getOperand(3).isImm()) { 4175 unsigned Val = Root.getOperand(3).getImm(); 4176 Imm = Imm << Val; 4177 } 4178 uint64_t UImm = SignExtend64(Imm, BitSize); 4179 uint64_t Encoding; 4180 if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { 4181 MachineInstrBuilder MIB1 = 4182 BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR) 4183 .addReg(ZeroReg) 4184 .addImm(Encoding); 4185 InsInstrs.push_back(MIB1); 4186 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 4187 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 4188 } 4189 break; 4190 } 4191 case MachineCombinerPattern::MULSUBW_OP1: 4192 case MachineCombinerPattern::MULSUBX_OP1: { 4193 // MUL I=A,B,0 4194 // SUB R,I, C 4195 // ==> SUB V, 0, C 4196 // ==> MADD R,A,B,V // = -C + A*B 4197 // --- Create(MADD); 4198 const TargetRegisterClass *SubRC; 4199 unsigned SubOpc, ZeroReg; 4200 if (Pattern == MachineCombinerPattern::MULSUBW_OP1) { 4201 SubOpc = AArch64::SUBWrr; 4202 SubRC = &AArch64::GPR32spRegClass; 4203 ZeroReg = AArch64::WZR; 4204 Opc = AArch64::MADDWrrr; 4205 RC = &AArch64::GPR32RegClass; 4206 } else { 4207 SubOpc = AArch64::SUBXrr; 4208 SubRC = &AArch64::GPR64spRegClass; 4209 ZeroReg = AArch64::XZR; 4210 Opc = AArch64::MADDXrrr; 4211 RC = &AArch64::GPR64RegClass; 4212 } 4213 Register NewVR = MRI.createVirtualRegister(SubRC); 4214 // SUB NewVR, 0, C 4215 MachineInstrBuilder MIB1 = 4216 BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR) 4217 .addReg(ZeroReg) 4218 .add(Root.getOperand(2)); 4219 InsInstrs.push_back(MIB1); 4220 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 4221 MUL 
= genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 4222 break; 4223 } 4224 case MachineCombinerPattern::MULSUBW_OP2: 4225 case MachineCombinerPattern::MULSUBX_OP2: 4226 // MUL I=A,B,0 4227 // SUB R,C,I 4228 // ==> MSUB R,A,B,C (computes C - A*B) 4229 // --- Create(MSUB); 4230 if (Pattern == MachineCombinerPattern::MULSUBW_OP2) { 4231 Opc = AArch64::MSUBWrrr; 4232 RC = &AArch64::GPR32RegClass; 4233 } else { 4234 Opc = AArch64::MSUBXrrr; 4235 RC = &AArch64::GPR64RegClass; 4236 } 4237 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4238 break; 4239 case MachineCombinerPattern::MULSUBWI_OP1: 4240 case MachineCombinerPattern::MULSUBXI_OP1: { 4241 // MUL I=A,B,0 4242 // SUB R,I, Imm 4243 // ==> ORR V, ZR, -Imm 4244 // ==> MADD R,A,B,V // = -Imm + A*B 4245 // --- Create(MADD); 4246 const TargetRegisterClass *OrrRC; 4247 unsigned BitSize, OrrOpc, ZeroReg; 4248 if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) { 4249 OrrOpc = AArch64::ORRWri; 4250 OrrRC = &AArch64::GPR32spRegClass; 4251 BitSize = 32; 4252 ZeroReg = AArch64::WZR; 4253 Opc = AArch64::MADDWrrr; 4254 RC = &AArch64::GPR32RegClass; 4255 } else { 4256 OrrOpc = AArch64::ORRXri; 4257 OrrRC = &AArch64::GPR64spRegClass; 4258 BitSize = 64; 4259 ZeroReg = AArch64::XZR; 4260 Opc = AArch64::MADDXrrr; 4261 RC = &AArch64::GPR64RegClass; 4262 } 4263 Register NewVR = MRI.createVirtualRegister(OrrRC); 4264 uint64_t Imm = Root.getOperand(2).getImm(); 4265 if (Root.getOperand(3).isImm()) { 4266 unsigned Val = Root.getOperand(3).getImm(); 4267 Imm = Imm << Val; 4268 } 4269 uint64_t UImm = SignExtend64(-Imm, BitSize); 4270 uint64_t Encoding; 4271 if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { 4272 MachineInstrBuilder MIB1 = 4273 BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR) 4274 .addReg(ZeroReg) 4275 .addImm(Encoding); 4276 InsInstrs.push_back(MIB1); 4277 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 4278 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 4279 } 4280 break; 4281 } 4282 // Floating Point Support 4283 case MachineCombinerPattern::FMULADDH_OP1: 4284 Opc = AArch64::FMADDHrrr; 4285 RC = &AArch64::FPR16RegClass; 4286 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4287 break; 4288 case MachineCombinerPattern::FMULADDS_OP1: 4289 Opc = AArch64::FMADDSrrr; 4290 RC = &AArch64::FPR32RegClass; 4291 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4292 break; 4293 case MachineCombinerPattern::FMULADDD_OP1: 4294 Opc = AArch64::FMADDDrrr; 4295 RC = &AArch64::FPR64RegClass; 4296 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4297 break; 4298 4299 case MachineCombinerPattern::FMULADDH_OP2: 4300 Opc = AArch64::FMADDHrrr; 4301 RC = &AArch64::FPR16RegClass; 4302 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4303 break; 4304 case MachineCombinerPattern::FMULADDS_OP2: 4305 Opc = AArch64::FMADDSrrr; 4306 RC = &AArch64::FPR32RegClass; 4307 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4308 break; 4309 case MachineCombinerPattern::FMULADDD_OP2: 4310 Opc = AArch64::FMADDDrrr; 4311 RC = &AArch64::FPR64RegClass; 4312 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4313 break; 4314 4315 case MachineCombinerPattern::FMLAv1i32_indexed_OP1: 4316 Opc = AArch64::FMLAv1i32_indexed; 4317 RC = &AArch64::FPR32RegClass; 4318 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4319 FMAInstKind::Indexed); 4320 break; 4321 case 
MachineCombinerPattern::FMLAv1i32_indexed_OP2: 4322 Opc = AArch64::FMLAv1i32_indexed; 4323 RC = &AArch64::FPR32RegClass; 4324 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4325 FMAInstKind::Indexed); 4326 break; 4327 4328 case MachineCombinerPattern::FMLAv1i64_indexed_OP1: 4329 Opc = AArch64::FMLAv1i64_indexed; 4330 RC = &AArch64::FPR64RegClass; 4331 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4332 FMAInstKind::Indexed); 4333 break; 4334 case MachineCombinerPattern::FMLAv1i64_indexed_OP2: 4335 Opc = AArch64::FMLAv1i64_indexed; 4336 RC = &AArch64::FPR64RegClass; 4337 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4338 FMAInstKind::Indexed); 4339 break; 4340 4341 case MachineCombinerPattern::FMLAv4i16_indexed_OP1: 4342 RC = &AArch64::FPR64RegClass; 4343 Opc = AArch64::FMLAv4i16_indexed; 4344 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4345 FMAInstKind::Indexed); 4346 break; 4347 case MachineCombinerPattern::FMLAv4f16_OP1: 4348 RC = &AArch64::FPR64RegClass; 4349 Opc = AArch64::FMLAv4f16; 4350 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4351 FMAInstKind::Accumulator); 4352 break; 4353 case MachineCombinerPattern::FMLAv4i16_indexed_OP2: 4354 RC = &AArch64::FPR64RegClass; 4355 Opc = AArch64::FMLAv4i16_indexed; 4356 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4357 FMAInstKind::Indexed); 4358 break; 4359 case MachineCombinerPattern::FMLAv4f16_OP2: 4360 RC = &AArch64::FPR64RegClass; 4361 Opc = AArch64::FMLAv4f16; 4362 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4363 FMAInstKind::Accumulator); 4364 break; 4365 4366 case MachineCombinerPattern::FMLAv2i32_indexed_OP1: 4367 case MachineCombinerPattern::FMLAv2f32_OP1: 4368 RC = &AArch64::FPR64RegClass; 4369 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) { 4370 Opc = AArch64::FMLAv2i32_indexed; 4371 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4372 FMAInstKind::Indexed); 4373 } else { 4374 Opc = AArch64::FMLAv2f32; 4375 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4376 FMAInstKind::Accumulator); 4377 } 4378 break; 4379 case MachineCombinerPattern::FMLAv2i32_indexed_OP2: 4380 case MachineCombinerPattern::FMLAv2f32_OP2: 4381 RC = &AArch64::FPR64RegClass; 4382 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) { 4383 Opc = AArch64::FMLAv2i32_indexed; 4384 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4385 FMAInstKind::Indexed); 4386 } else { 4387 Opc = AArch64::FMLAv2f32; 4388 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4389 FMAInstKind::Accumulator); 4390 } 4391 break; 4392 4393 case MachineCombinerPattern::FMLAv8i16_indexed_OP1: 4394 RC = &AArch64::FPR128RegClass; 4395 Opc = AArch64::FMLAv8i16_indexed; 4396 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4397 FMAInstKind::Indexed); 4398 break; 4399 case MachineCombinerPattern::FMLAv8f16_OP1: 4400 RC = &AArch64::FPR128RegClass; 4401 Opc = AArch64::FMLAv8f16; 4402 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4403 FMAInstKind::Accumulator); 4404 break; 4405 case MachineCombinerPattern::FMLAv8i16_indexed_OP2: 4406 RC = &AArch64::FPR128RegClass; 4407 Opc = AArch64::FMLAv8i16_indexed; 4408 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4409 FMAInstKind::Indexed); 4410 break; 4411 case MachineCombinerPattern::FMLAv8f16_OP2: 4412 RC = &AArch64::FPR128RegClass; 4413 Opc = AArch64::FMLAv8f16; 4414 MUL = 
genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4415 FMAInstKind::Accumulator); 4416 break; 4417 4418 case MachineCombinerPattern::FMLAv2i64_indexed_OP1: 4419 case MachineCombinerPattern::FMLAv2f64_OP1: 4420 RC = &AArch64::FPR128RegClass; 4421 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) { 4422 Opc = AArch64::FMLAv2i64_indexed; 4423 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4424 FMAInstKind::Indexed); 4425 } else { 4426 Opc = AArch64::FMLAv2f64; 4427 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4428 FMAInstKind::Accumulator); 4429 } 4430 break; 4431 case MachineCombinerPattern::FMLAv2i64_indexed_OP2: 4432 case MachineCombinerPattern::FMLAv2f64_OP2: 4433 RC = &AArch64::FPR128RegClass; 4434 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) { 4435 Opc = AArch64::FMLAv2i64_indexed; 4436 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4437 FMAInstKind::Indexed); 4438 } else { 4439 Opc = AArch64::FMLAv2f64; 4440 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4441 FMAInstKind::Accumulator); 4442 } 4443 break; 4444 4445 case MachineCombinerPattern::FMLAv4i32_indexed_OP1: 4446 case MachineCombinerPattern::FMLAv4f32_OP1: 4447 RC = &AArch64::FPR128RegClass; 4448 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) { 4449 Opc = AArch64::FMLAv4i32_indexed; 4450 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4451 FMAInstKind::Indexed); 4452 } else { 4453 Opc = AArch64::FMLAv4f32; 4454 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4455 FMAInstKind::Accumulator); 4456 } 4457 break; 4458 4459 case MachineCombinerPattern::FMLAv4i32_indexed_OP2: 4460 case MachineCombinerPattern::FMLAv4f32_OP2: 4461 RC = &AArch64::FPR128RegClass; 4462 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) { 4463 Opc = AArch64::FMLAv4i32_indexed; 4464 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4465 FMAInstKind::Indexed); 4466 } else { 4467 Opc = AArch64::FMLAv4f32; 4468 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4469 FMAInstKind::Accumulator); 4470 } 4471 break; 4472 4473 case MachineCombinerPattern::FMULSUBH_OP1: 4474 Opc = AArch64::FNMSUBHrrr; 4475 RC = &AArch64::FPR16RegClass; 4476 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4477 break; 4478 case MachineCombinerPattern::FMULSUBS_OP1: 4479 Opc = AArch64::FNMSUBSrrr; 4480 RC = &AArch64::FPR32RegClass; 4481 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4482 break; 4483 case MachineCombinerPattern::FMULSUBD_OP1: 4484 Opc = AArch64::FNMSUBDrrr; 4485 RC = &AArch64::FPR64RegClass; 4486 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4487 break; 4488 4489 case MachineCombinerPattern::FNMULSUBH_OP1: 4490 Opc = AArch64::FNMADDHrrr; 4491 RC = &AArch64::FPR16RegClass; 4492 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4493 break; 4494 case MachineCombinerPattern::FNMULSUBS_OP1: 4495 Opc = AArch64::FNMADDSrrr; 4496 RC = &AArch64::FPR32RegClass; 4497 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4498 break; 4499 case MachineCombinerPattern::FNMULSUBD_OP1: 4500 Opc = AArch64::FNMADDDrrr; 4501 RC = &AArch64::FPR64RegClass; 4502 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4503 break; 4504 4505 case MachineCombinerPattern::FMULSUBH_OP2: 4506 Opc = AArch64::FMSUBHrrr; 4507 RC = &AArch64::FPR16RegClass; 4508 MUL = genFusedMultiply(MF, MRI, TII, 
Root, InsInstrs, 2, Opc, RC); 4509 break; 4510 case MachineCombinerPattern::FMULSUBS_OP2: 4511 Opc = AArch64::FMSUBSrrr; 4512 RC = &AArch64::FPR32RegClass; 4513 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4514 break; 4515 case MachineCombinerPattern::FMULSUBD_OP2: 4516 Opc = AArch64::FMSUBDrrr; 4517 RC = &AArch64::FPR64RegClass; 4518 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4519 break; 4520 4521 case MachineCombinerPattern::FMLSv1i32_indexed_OP2: 4522 Opc = AArch64::FMLSv1i32_indexed; 4523 RC = &AArch64::FPR32RegClass; 4524 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4525 FMAInstKind::Indexed); 4526 break; 4527 4528 case MachineCombinerPattern::FMLSv1i64_indexed_OP2: 4529 Opc = AArch64::FMLSv1i64_indexed; 4530 RC = &AArch64::FPR64RegClass; 4531 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4532 FMAInstKind::Indexed); 4533 break; 4534 4535 case MachineCombinerPattern::FMLSv4f16_OP1: 4536 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: { 4537 RC = &AArch64::FPR64RegClass; 4538 Register NewVR = MRI.createVirtualRegister(RC); 4539 MachineInstrBuilder MIB1 = 4540 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f16), NewVR) 4541 .add(Root.getOperand(2)); 4542 InsInstrs.push_back(MIB1); 4543 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 4544 if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) { 4545 Opc = AArch64::FMLAv4f16; 4546 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4547 FMAInstKind::Accumulator, &NewVR); 4548 } else { 4549 Opc = AArch64::FMLAv4i16_indexed; 4550 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4551 FMAInstKind::Indexed, &NewVR); 4552 } 4553 break; 4554 } 4555 case MachineCombinerPattern::FMLSv4f16_OP2: 4556 RC = &AArch64::FPR64RegClass; 4557 Opc = AArch64::FMLSv4f16; 4558 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4559 FMAInstKind::Accumulator); 4560 break; 4561 case MachineCombinerPattern::FMLSv4i16_indexed_OP2: 4562 RC = &AArch64::FPR64RegClass; 4563 Opc = AArch64::FMLSv4i16_indexed; 4564 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4565 FMAInstKind::Indexed); 4566 break; 4567 4568 case MachineCombinerPattern::FMLSv2f32_OP2: 4569 case MachineCombinerPattern::FMLSv2i32_indexed_OP2: 4570 RC = &AArch64::FPR64RegClass; 4571 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) { 4572 Opc = AArch64::FMLSv2i32_indexed; 4573 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4574 FMAInstKind::Indexed); 4575 } else { 4576 Opc = AArch64::FMLSv2f32; 4577 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4578 FMAInstKind::Accumulator); 4579 } 4580 break; 4581 4582 case MachineCombinerPattern::FMLSv8f16_OP1: 4583 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: { 4584 RC = &AArch64::FPR128RegClass; 4585 Register NewVR = MRI.createVirtualRegister(RC); 4586 MachineInstrBuilder MIB1 = 4587 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv8f16), NewVR) 4588 .add(Root.getOperand(2)); 4589 InsInstrs.push_back(MIB1); 4590 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 4591 if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) { 4592 Opc = AArch64::FMLAv8f16; 4593 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4594 FMAInstKind::Accumulator, &NewVR); 4595 } else { 4596 Opc = AArch64::FMLAv8i16_indexed; 4597 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4598 FMAInstKind::Indexed, &NewVR); 4599 } 4600 break; 
4601 } 4602 case MachineCombinerPattern::FMLSv8f16_OP2: 4603 RC = &AArch64::FPR128RegClass; 4604 Opc = AArch64::FMLSv8f16; 4605 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4606 FMAInstKind::Accumulator); 4607 break; 4608 case MachineCombinerPattern::FMLSv8i16_indexed_OP2: 4609 RC = &AArch64::FPR128RegClass; 4610 Opc = AArch64::FMLSv8i16_indexed; 4611 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4612 FMAInstKind::Indexed); 4613 break; 4614 4615 case MachineCombinerPattern::FMLSv2f64_OP2: 4616 case MachineCombinerPattern::FMLSv2i64_indexed_OP2: 4617 RC = &AArch64::FPR128RegClass; 4618 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) { 4619 Opc = AArch64::FMLSv2i64_indexed; 4620 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4621 FMAInstKind::Indexed); 4622 } else { 4623 Opc = AArch64::FMLSv2f64; 4624 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4625 FMAInstKind::Accumulator); 4626 } 4627 break; 4628 4629 case MachineCombinerPattern::FMLSv4f32_OP2: 4630 case MachineCombinerPattern::FMLSv4i32_indexed_OP2: 4631 RC = &AArch64::FPR128RegClass; 4632 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) { 4633 Opc = AArch64::FMLSv4i32_indexed; 4634 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4635 FMAInstKind::Indexed); 4636 } else { 4637 Opc = AArch64::FMLSv4f32; 4638 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4639 FMAInstKind::Accumulator); 4640 } 4641 break; 4642 case MachineCombinerPattern::FMLSv2f32_OP1: 4643 case MachineCombinerPattern::FMLSv2i32_indexed_OP1: { 4644 RC = &AArch64::FPR64RegClass; 4645 Register NewVR = MRI.createVirtualRegister(RC); 4646 MachineInstrBuilder MIB1 = 4647 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR) 4648 .add(Root.getOperand(2)); 4649 InsInstrs.push_back(MIB1); 4650 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 4651 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) { 4652 Opc = AArch64::FMLAv2i32_indexed; 4653 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4654 FMAInstKind::Indexed, &NewVR); 4655 } else { 4656 Opc = AArch64::FMLAv2f32; 4657 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4658 FMAInstKind::Accumulator, &NewVR); 4659 } 4660 break; 4661 } 4662 case MachineCombinerPattern::FMLSv4f32_OP1: 4663 case MachineCombinerPattern::FMLSv4i32_indexed_OP1: { 4664 RC = &AArch64::FPR128RegClass; 4665 Register NewVR = MRI.createVirtualRegister(RC); 4666 MachineInstrBuilder MIB1 = 4667 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR) 4668 .add(Root.getOperand(2)); 4669 InsInstrs.push_back(MIB1); 4670 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 4671 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) { 4672 Opc = AArch64::FMLAv4i32_indexed; 4673 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4674 FMAInstKind::Indexed, &NewVR); 4675 } else { 4676 Opc = AArch64::FMLAv4f32; 4677 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4678 FMAInstKind::Accumulator, &NewVR); 4679 } 4680 break; 4681 } 4682 case MachineCombinerPattern::FMLSv2f64_OP1: 4683 case MachineCombinerPattern::FMLSv2i64_indexed_OP1: { 4684 RC = &AArch64::FPR128RegClass; 4685 Register NewVR = MRI.createVirtualRegister(RC); 4686 MachineInstrBuilder MIB1 = 4687 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR) 4688 .add(Root.getOperand(2)); 4689 InsInstrs.push_back(MIB1); 4690 
    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
    if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
      Opc = AArch64::FMLAv2i64_indexed;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Indexed, &NewVR);
    } else {
      Opc = AArch64::FMLAv2f64;
      MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
                             FMAInstKind::Accumulator, &NewVR);
    }
    break;
  }
  } // end switch (Pattern)
  // Record MUL and ADD/SUB for deletion
  DelInstrs.push_back(MUL);
  DelInstrs.push_back(&Root);
}

/// Replace csinc-branch sequence by simple conditional branch
///
/// Examples:
/// 1. \code
///   csinc w9, wzr, wzr, <condition code>
///   tbnz w9, #0, 0x44
///   \endcode
/// to
///   \code
///   b.<inverted condition code>
///   \endcode
///
/// 2. \code
///   csinc w9, wzr, wzr, <condition code>
///   tbz w9, #0, 0x44
///   \endcode
/// to
///   \code
///   b.<condition code>
///   \endcode
///
/// Replace compare and branch sequence by TBZ/TBNZ instruction when the
/// compare's constant operand is power of 2.
///
/// Examples:
///   \code
///   and  w8, w8, #0x400
///   cbnz w8, L1
///   \endcode
/// to
///   \code
///   tbnz w8, #10, L1
///   \endcode
///
/// \param MI Conditional Branch
/// \return True when the simple conditional branch is generated
///
bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
  bool IsNegativeBranch = false;
  bool IsTestAndBranch = false;
  unsigned TargetBBInMI = 0;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
    return false;
  case AArch64::CBZW:
  case AArch64::CBZX:
    TargetBBInMI = 1;
    break;
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    TargetBBInMI = 1;
    IsNegativeBranch = true;
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
    TargetBBInMI = 2;
    IsTestAndBranch = true;
    break;
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    TargetBBInMI = 2;
    IsNegativeBranch = true;
    IsTestAndBranch = true;
    break;
  }
  // So we increment a zero register and test for bits other
  // than bit 0? Conservatively bail out in case the verifier
  // missed this case.
  if (IsTestAndBranch && MI.getOperand(1).getImm())
    return false;

  // Find Definition.
  assert(MI.getParent() && "Incomplete machine instruction\n");
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  Register VReg = MI.getOperand(0).getReg();
  if (!Register::isVirtualRegister(VReg))
    return false;

  MachineInstr *DefMI = MRI->getVRegDef(VReg);

  // Look through COPY instructions to find definition.
  while (DefMI->isCopy()) {
    Register CopyVReg = DefMI->getOperand(1).getReg();
    if (!MRI->hasOneNonDBGUse(CopyVReg))
      return false;
    if (!MRI->hasOneDef(CopyVReg))
      return false;
    DefMI = MRI->getVRegDef(CopyVReg);
  }

  switch (DefMI->getOpcode()) {
  default:
    return false;
  // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
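  // For example (see the function comment above):
  //   and  w8, w8, #0x400
  //   cbnz w8, L1
  // becomes
  //   tbnz w8, #10, L1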
  case AArch64::ANDWri:
  case AArch64::ANDXri: {
    if (IsTestAndBranch)
      return false;
    if (DefMI->getParent() != MBB)
      return false;
    if (!MRI->hasOneNonDBGUse(VReg))
      return false;

    bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
    uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
        DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
    if (!isPowerOf2_64(Mask))
      return false;

    MachineOperand &MO = DefMI->getOperand(1);
    Register NewReg = MO.getReg();
    if (!Register::isVirtualRegister(NewReg))
      return false;

    assert(!MRI->def_empty(NewReg) && "Register must be defined.");

    MachineBasicBlock &RefToMBB = *MBB;
    MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
    DebugLoc DL = MI.getDebugLoc();
    unsigned Imm = Log2_64(Mask);
    unsigned Opc = (Imm < 32)
                       ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
                       : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
    MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
                              .addReg(NewReg)
                              .addImm(Imm)
                              .addMBB(TBB);
    // Register lives on to the CBZ now.
    MO.setIsKill(false);

    // For immediates smaller than 32, we need to use the 32-bit
    // variant (W) in all cases. Indeed the 64-bit variant cannot
    // encode them.
    // Therefore, if the input register is 64-bit, we need to take the
    // 32-bit sub-part.
    if (!Is32Bit && Imm < 32)
      NewMI->getOperand(0).setSubReg(AArch64::sub_32);
    MI.eraseFromParent();
    return true;
  }
  // Look for CSINC
  case AArch64::CSINCWr:
  case AArch64::CSINCXr: {
    if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
          DefMI->getOperand(2).getReg() == AArch64::WZR) &&
        !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
          DefMI->getOperand(2).getReg() == AArch64::XZR))
      return false;

    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
      return false;

    AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
    // Convert only when the condition code is not modified between
    // the CSINC and the branch. The CC may be used by other
    // instructions in between.
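    // For example (see the function comment above), "csinc w9, wzr, wzr, <cc>;
    // tbnz w9, #0, L1" becomes "b.<inverted cc> L1".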
4868 if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write)) 4869 return false; 4870 MachineBasicBlock &RefToMBB = *MBB; 4871 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB(); 4872 DebugLoc DL = MI.getDebugLoc(); 4873 if (IsNegativeBranch) 4874 CC = AArch64CC::getInvertedCondCode(CC); 4875 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB); 4876 MI.eraseFromParent(); 4877 return true; 4878 } 4879 } 4880 } 4881 4882 std::pair<unsigned, unsigned> 4883 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { 4884 const unsigned Mask = AArch64II::MO_FRAGMENT; 4885 return std::make_pair(TF & Mask, TF & ~Mask); 4886 } 4887 4888 ArrayRef<std::pair<unsigned, const char *>> 4889 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { 4890 using namespace AArch64II; 4891 4892 static const std::pair<unsigned, const char *> TargetFlags[] = { 4893 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"}, 4894 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"}, 4895 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"}, 4896 {MO_HI12, "aarch64-hi12"}}; 4897 return makeArrayRef(TargetFlags); 4898 } 4899 4900 ArrayRef<std::pair<unsigned, const char *>> 4901 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { 4902 using namespace AArch64II; 4903 4904 static const std::pair<unsigned, const char *> TargetFlags[] = { 4905 {MO_COFFSTUB, "aarch64-coffstub"}, 4906 {MO_GOT, "aarch64-got"}, 4907 {MO_NC, "aarch64-nc"}, 4908 {MO_S, "aarch64-s"}, 4909 {MO_TLS, "aarch64-tls"}, 4910 {MO_DLLIMPORT, "aarch64-dllimport"}, 4911 {MO_PREL, "aarch64-prel"}, 4912 {MO_TAGGED, "aarch64-tagged"}}; 4913 return makeArrayRef(TargetFlags); 4914 } 4915 4916 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> 4917 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const { 4918 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] = 4919 {{MOSuppressPair, "aarch64-suppress-pair"}, 4920 {MOStridedAccess, "aarch64-strided-access"}}; 4921 return makeArrayRef(TargetFlags); 4922 } 4923 4924 /// Constants defining how certain sequences should be outlined. 4925 /// This encompasses how an outlined function should be called, and what kind of 4926 /// frame should be emitted for that outlined function. 4927 /// 4928 /// \p MachineOutlinerDefault implies that the function should be called with 4929 /// a save and restore of LR to the stack. 4930 /// 4931 /// That is, 4932 /// 4933 /// I1 Save LR OUTLINED_FUNCTION: 4934 /// I2 --> BL OUTLINED_FUNCTION I1 4935 /// I3 Restore LR I2 4936 /// I3 4937 /// RET 4938 /// 4939 /// * Call construction overhead: 3 (save + BL + restore) 4940 /// * Frame construction overhead: 1 (ret) 4941 /// * Requires stack fixups? Yes 4942 /// 4943 /// \p MachineOutlinerTailCall implies that the function is being created from 4944 /// a sequence of instructions ending in a return. 4945 /// 4946 /// That is, 4947 /// 4948 /// I1 OUTLINED_FUNCTION: 4949 /// I2 --> B OUTLINED_FUNCTION I1 4950 /// RET I2 4951 /// RET 4952 /// 4953 /// * Call construction overhead: 1 (B) 4954 /// * Frame construction overhead: 0 (Return included in sequence) 4955 /// * Requires stack fixups? No 4956 /// 4957 /// \p MachineOutlinerNoLRSave implies that the function should be called using 4958 /// a BL instruction, but doesn't require LR to be saved and restored. This 4959 /// happens when LR is known to be dead. 
4960 /// 4961 /// That is, 4962 /// 4963 /// I1 OUTLINED_FUNCTION: 4964 /// I2 --> BL OUTLINED_FUNCTION I1 4965 /// I3 I2 4966 /// I3 4967 /// RET 4968 /// 4969 /// * Call construction overhead: 1 (BL) 4970 /// * Frame construction overhead: 1 (RET) 4971 /// * Requires stack fixups? No 4972 /// 4973 /// \p MachineOutlinerThunk implies that the function is being created from 4974 /// a sequence of instructions ending in a call. The outlined function is 4975 /// called with a BL instruction, and the outlined function tail-calls the 4976 /// original call destination. 4977 /// 4978 /// That is, 4979 /// 4980 /// I1 OUTLINED_FUNCTION: 4981 /// I2 --> BL OUTLINED_FUNCTION I1 4982 /// BL f I2 4983 /// B f 4984 /// * Call construction overhead: 1 (BL) 4985 /// * Frame construction overhead: 0 4986 /// * Requires stack fixups? No 4987 /// 4988 /// \p MachineOutlinerRegSave implies that the function should be called with a 4989 /// save and restore of LR to an available register. This allows us to avoid 4990 /// stack fixups. Note that this outlining variant is compatible with the 4991 /// NoLRSave case. 4992 /// 4993 /// That is, 4994 /// 4995 /// I1 Save LR OUTLINED_FUNCTION: 4996 /// I2 --> BL OUTLINED_FUNCTION I1 4997 /// I3 Restore LR I2 4998 /// I3 4999 /// RET 5000 /// 5001 /// * Call construction overhead: 3 (save + BL + restore) 5002 /// * Frame construction overhead: 1 (ret) 5003 /// * Requires stack fixups? No 5004 enum MachineOutlinerClass { 5005 MachineOutlinerDefault, /// Emit a save, restore, call, and return. 5006 MachineOutlinerTailCall, /// Only emit a branch. 5007 MachineOutlinerNoLRSave, /// Emit a call and return. 5008 MachineOutlinerThunk, /// Emit a call and tail-call. 5009 MachineOutlinerRegSave /// Same as default, but save to a register. 5010 }; 5011 5012 enum MachineOutlinerMBBFlags { 5013 LRUnavailableSomewhere = 0x2, 5014 HasCalls = 0x4, 5015 UnsafeRegsDead = 0x8 5016 }; 5017 5018 unsigned 5019 AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const { 5020 assert(C.LRUWasSet && "LRU wasn't set?"); 5021 MachineFunction *MF = C.getMF(); 5022 const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>( 5023 MF->getSubtarget().getRegisterInfo()); 5024 5025 // Check if there is an available register across the sequence that we can 5026 // use. 5027 for (unsigned Reg : AArch64::GPR64RegClass) { 5028 if (!ARI->isReservedReg(*MF, Reg) && 5029 Reg != AArch64::LR && // LR is not reserved, but don't use it. 5030 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved. 5031 Reg != AArch64::X17 && // Ditto for X17. 5032 C.LRU.available(Reg) && C.UsedInSequence.available(Reg)) 5033 return Reg; 5034 } 5035 5036 // No suitable register. Return 0. 5037 return 0u; 5038 } 5039 5040 outliner::OutlinedFunction 5041 AArch64InstrInfo::getOutliningCandidateInfo( 5042 std::vector<outliner::Candidate> &RepeatedSequenceLocs) const { 5043 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0]; 5044 unsigned SequenceSize = 5045 std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0, 5046 [this](unsigned Sum, const MachineInstr &MI) { 5047 return Sum + getInstSizeInBytes(MI); 5048 }); 5049 5050 // Properties about candidate MBBs that hold for all of them. 5051 unsigned FlagsSetInAll = 0xF; 5052 5053 // Compute liveness information for each candidate, and set FlagsSetInAll. 
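  // (Per-candidate liveness itself is computed lazily via initLRU() below;
  // here we only accumulate the flags that hold for every candidate.)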
  const TargetRegisterInfo &TRI = getRegisterInfo();
  std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
                [&FlagsSetInAll](outliner::Candidate &C) {
                  FlagsSetInAll &= C.Flags;
                });

  // According to the AArch64 Procedure Call Standard, the following are
  // undefined on entry/exit from a function call:
  //
  // * Registers x16, x17, (and thus w16, w17)
  // * Condition codes (and thus the NZCV register)
  //
  // Because of this, we can't outline any sequence of instructions where one
  // of these registers is live into/across it. Thus, we need to delete those
  // candidates.
  auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) {
    // If the unsafe registers in this block are all dead, then we don't need
    // to compute liveness here.
    if (C.Flags & UnsafeRegsDead)
      return false;
    C.initLRU(TRI);
    LiveRegUnits LRU = C.LRU;
    return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) ||
            !LRU.available(AArch64::NZCV));
  };

  // Are there any candidates where those registers are live?
  if (!(FlagsSetInAll & UnsafeRegsDead)) {
    // Erase every candidate that violates the restrictions above. (It could be
    // true that we have viable candidates, so it's not worth bailing out in
    // the case that, say, 1 out of 20 candidates violates the restrictions.)
    RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
                                              RepeatedSequenceLocs.end(),
                                              CantGuaranteeValueAcrossCall),
                               RepeatedSequenceLocs.end());

    // If the sequence doesn't have enough candidates left, then we're done.
    if (RepeatedSequenceLocs.size() < 2)
      return outliner::OutlinedFunction();
  }

  // At this point, we have only "safe" candidates to outline. Figure out
  // frame + call instruction information.

  unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode();

  // Helper lambda which sets call information for every candidate.
  auto SetCandidateCallInfo =
      [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
        for (outliner::Candidate &C : RepeatedSequenceLocs)
          C.setCallInfo(CallID, NumBytesForCall);
      };

  unsigned FrameID = MachineOutlinerDefault;
  unsigned NumBytesToCreateFrame = 4;

  bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
    return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement");
  });

  // Returns true if an instruction is safe to fix up, false otherwise.
  auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
    if (MI.isCall())
      return true;

    if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
        !MI.readsRegister(AArch64::SP, &TRI))
      return true;

    // Any modification of SP will break our code to save/restore LR.
    // FIXME: We could handle some instructions which add a constant
    // offset to SP, with a bit more work.
    if (MI.modifiesRegister(AArch64::SP, &TRI))
      return false;

    // At this point, we have a stack instruction that we might need to
    // fix up. We'll handle it if it's a load or store.
    if (MI.mayLoadOrStore()) {
      const MachineOperand *Base; // Filled with the base operand of MI.
      int64_t Offset;             // Filled with the offset of MI.

      // Does it allow us to offset the base operand and is the base the
      // register SP?
      if (!getMemOperandWithOffset(MI, Base, Offset, &TRI) || !Base->isReg() ||
          Base->getReg() != AArch64::SP)
        return false;

      // Find the minimum/maximum offset for this instruction and check
      // if fixing it up would be in range.
      int64_t MinOffset,
          MaxOffset;        // Unscaled offsets for the instruction.
      unsigned Scale;       // The scale to multiply the offsets by.
      unsigned DummyWidth;
      getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);

      Offset += 16; // Update the offset to what it would be if we outlined.
      if (Offset < MinOffset * Scale || Offset > MaxOffset * Scale)
        return false;

      // It's in range, so we can outline it.
      return true;
    }

    // FIXME: Add handling for instructions like "add x0, sp, #8".

    // We can't fix it up, so don't outline it.
    return false;
  };

  // True if it's possible to fix up each stack instruction in this sequence.
  // Important for frames/call variants that modify the stack.
  bool AllStackInstrsSafe = std::all_of(
      FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup);

  // If the last instruction in any candidate is a terminator, then we should
  // tail call all of the candidates.
  if (RepeatedSequenceLocs[0].back()->isTerminator()) {
    FrameID = MachineOutlinerTailCall;
    NumBytesToCreateFrame = 0;
    SetCandidateCallInfo(MachineOutlinerTailCall, 4);
  }

  else if (LastInstrOpcode == AArch64::BL ||
           (LastInstrOpcode == AArch64::BLR && !HasBTI)) {
    // FIXME: Do we need to check if the code after this uses the value of LR?
    FrameID = MachineOutlinerThunk;
    NumBytesToCreateFrame = 0;
    SetCandidateCallInfo(MachineOutlinerThunk, 4);
  }

  else {
    // We need to decide how to emit calls + frames. We can always emit the
    // same frame if we don't need to save to the stack. If we have to save to
    // the stack, then we need a different frame.
    unsigned NumBytesNoStackCalls = 0;
    std::vector<outliner::Candidate> CandidatesWithoutStackFixups;

    for (outliner::Candidate &C : RepeatedSequenceLocs) {
      C.initLRU(TRI);

      // Is LR available? If so, we don't need a save.
      if (C.LRU.available(AArch64::LR)) {
        NumBytesNoStackCalls += 4;
        C.setCallInfo(MachineOutlinerNoLRSave, 4);
        CandidatesWithoutStackFixups.push_back(C);
      }

      // Is an unused register available? If so, we won't modify the stack, so
      // we can outline with the same frame type as those that don't save LR.
      else if (findRegisterToSaveLRTo(C)) {
        NumBytesNoStackCalls += 12;
        C.setCallInfo(MachineOutlinerRegSave, 12);
        CandidatesWithoutStackFixups.push_back(C);
      }

      // Is SP used in the sequence at all? If not, we don't have to modify
      // the stack, so we are guaranteed to get the same frame.
      else if (C.UsedInSequence.available(AArch64::SP)) {
        NumBytesNoStackCalls += 12;
        C.setCallInfo(MachineOutlinerDefault, 12);
        CandidatesWithoutStackFixups.push_back(C);
      }

      // If we outline this, we need to modify the stack. Pretend we don't
      // outline this by saving all of its bytes.
      else {
        NumBytesNoStackCalls += SequenceSize;
      }
    }

    // If there are no places where we have to save LR, then note that we
    // don't have to update the stack. Otherwise, give every candidate the
    // default call type, as long as it's safe to do so.
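    // (Here 12 bytes is the cost of a save + BL + restore call; if the
    // variants that avoid stack fixups are, in total, no more expensive than
    // giving every candidate that default call, prefer them.)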
    if (!AllStackInstrsSafe ||
        NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
      RepeatedSequenceLocs = CandidatesWithoutStackFixups;
      FrameID = MachineOutlinerNoLRSave;
    } else {
      SetCandidateCallInfo(MachineOutlinerDefault, 12);
    }

    // If we dropped all of the candidates, bail out here.
    if (RepeatedSequenceLocs.size() < 2) {
      RepeatedSequenceLocs.clear();
      return outliner::OutlinedFunction();
    }
  }

  // Does every candidate's MBB contain a call? If so, then we might have a
  // call in the range.
  if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
    // Check if the range contains a call. These require a save + restore of
    // the link register.
    bool ModStackToSaveLR = false;
    if (std::any_of(FirstCand.front(), FirstCand.back(),
                    [](const MachineInstr &MI) { return MI.isCall(); }))
      ModStackToSaveLR = true;

    // Handle the last instruction separately. If this is a tail call, then the
    // last instruction is a call. We don't want to save + restore in this
    // case. However, it could be possible that the last instruction is a call
    // without it being valid to tail call this sequence. We should consider
    // this as well.
    else if (FrameID != MachineOutlinerThunk &&
             FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall())
      ModStackToSaveLR = true;

    if (ModStackToSaveLR) {
      // We can't fix up the stack. Bail out.
      if (!AllStackInstrsSafe) {
        RepeatedSequenceLocs.clear();
        return outliner::OutlinedFunction();
      }

      // Save + restore LR.
      NumBytesToCreateFrame += 8;
    }
  }

  return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
                                    NumBytesToCreateFrame, FrameID);
}

bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
    MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
  const Function &F = MF.getFunction();

  // Can F be deduplicated by the linker? If it can, don't outline from it.
  if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
    return false;

  // Don't outline from functions with section markings; the program could
  // expect that all the code is in the named section.
  // FIXME: Allow outlining from multiple functions with the same section
  // marking.
  if (F.hasSection())
    return false;

  // Outlining from functions with redzones is unsafe since the outliner may
  // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
  // outline from it.
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  if (!AFI || AFI->hasRedZone().getValueOr(true))
    return false;

  // It's safe to outline from MF.
  return true;
}

bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
                                              unsigned &Flags) const {
  // Check if LR is available through all of the MBB. If it's not, then set
  // a flag.
  assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
         "Suitable Machine Function for outlining must track liveness");
  LiveRegUnits LRU(getRegisterInfo());

  std::for_each(MBB.rbegin(), MBB.rend(),
                [&LRU](MachineInstr &MI) { LRU.accumulate(MI); });

  // Check if each of the unsafe registers is available...
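  // ...meaning not used anywhere in the block itself; live-outs are only
  // added to LRU afterwards.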
  bool W16AvailableInBlock = LRU.available(AArch64::W16);
  bool W17AvailableInBlock = LRU.available(AArch64::W17);
  bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV);

  // If all of these are dead (and not live out), we know we don't have to
  // check them later.
  if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock)
    Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead;

  // Now, add the live outs to the set.
  LRU.addLiveOuts(MBB);

  // If any of these registers is available in the MBB, but also a live out of
  // the block, then we know outlining is unsafe.
  if (W16AvailableInBlock && !LRU.available(AArch64::W16))
    return false;
  if (W17AvailableInBlock && !LRU.available(AArch64::W17))
    return false;
  if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV))
    return false;

  // Check if there's a call inside this MachineBasicBlock. If there is, then
  // set a flag.
  if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
    Flags |= MachineOutlinerMBBFlags::HasCalls;

  MachineFunction *MF = MBB.getParent();

  // In the event that we outline, we may have to save LR. If there is an
  // available register in the MBB, then we'll always save LR there. Check if
  // this is true.
  bool CanSaveLR = false;
  const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
      MF->getSubtarget().getRegisterInfo());

  // Check if there is an available register across the sequence that we can
  // use.
  for (unsigned Reg : AArch64::GPR64RegClass) {
    if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR &&
        Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) {
      CanSaveLR = true;
      break;
    }
  }

  // Check if we have a register we can save LR to, and if LR was used
  // somewhere. If both of those things are true, then we need to evaluate the
  // safety of outlining stack instructions later.
  if (!CanSaveLR && !LRU.available(AArch64::LR))
    Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;

  return true;
}

outliner::InstrType
AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
                                   unsigned Flags) const {
  MachineInstr &MI = *MIT;
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();

  // Don't outline LOHs.
  if (FuncInfo->getLOHRelated().count(&MI))
    return outliner::InstrType::Illegal;

  // Don't allow debug values to impact outlining type.
  if (MI.isDebugInstr() || MI.isIndirectDebugValue())
    return outliner::InstrType::Invisible;

  // At this point, KILL instructions don't really tell us much so we can go
  // ahead and skip over them.
  if (MI.isKill())
    return outliner::InstrType::Invisible;

  // Is this a terminator for a basic block?
  if (MI.isTerminator()) {

    // Is this the end of a function?
    if (MI.getParent()->succ_empty())
      return outliner::InstrType::Legal;

    // It's not, so don't outline it.
    return outliner::InstrType::Illegal;
  }

  // Make sure none of the operands are un-outlinable.
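  // (Constant-pool, jump-table, CFI, frame and target indices are rejected
  // because the outliner can't guarantee they remain meaningful once the
  // instruction moves into a different function.)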
  for (const MachineOperand &MOP : MI.operands()) {
    if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
        MOP.isTargetIndex())
      return outliner::InstrType::Illegal;

    // If it uses LR or W30 explicitly, then don't touch it.
    if (MOP.isReg() && !MOP.isImplicit() &&
        (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
      return outliner::InstrType::Illegal;
  }

  // Special cases for instructions that can always be outlined, but will fail
  // the later tests. E.g. ADRPs, which are PC-relative, use LR, but can always
  // be outlined because they don't require a *specific* value to be in LR.
  if (MI.getOpcode() == AArch64::ADRP)
    return outliner::InstrType::Legal;

  // If MI is a call we might be able to outline it. We don't want to outline
  // any calls that rely on the position of items on the stack. When we outline
  // something containing a call, we have to emit a save and restore of LR in
  // the outlined function. Currently, this always happens by saving LR to the
  // stack. Thus, if we outline, say, half the parameters for a function call
  // plus the call, then we'll break the callee's expectations for the layout
  // of the stack.
  //
  // FIXME: Allow calls to functions which construct a stack frame, as long
  // as they don't access arguments on the stack.
  // FIXME: Figure out some way to analyze functions defined in other modules.
  // We should be able to compute the memory usage based on the IR calling
  // convention, even if we can't see the definition.
  if (MI.isCall()) {
    // Get the function associated with the call. Look at each operand and find
    // the one that represents the callee and get its name.
    const Function *Callee = nullptr;
    for (const MachineOperand &MOP : MI.operands()) {
      if (MOP.isGlobal()) {
        Callee = dyn_cast<Function>(MOP.getGlobal());
        break;
      }
    }

    // Never outline calls to mcount. There isn't any rule that would require
    // this, but the Linux kernel's "ftrace" feature depends on it.
    if (Callee && Callee->getName() == "\01_mcount")
      return outliner::InstrType::Illegal;

    // If we don't know anything about the callee, assume it depends on the
    // stack layout of the caller. In that case, it's only legal to outline
    // as a tail-call. Whitelist the call instructions we know about so we
    // don't get unexpected results with call pseudo-instructions.
    auto UnknownCallOutlineType = outliner::InstrType::Illegal;
    if (MI.getOpcode() == AArch64::BLR || MI.getOpcode() == AArch64::BL)
      UnknownCallOutlineType = outliner::InstrType::LegalTerminator;

    if (!Callee)
      return UnknownCallOutlineType;

    // We have a function we have information about. Check if it's something
    // we can safely outline.
    MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);

    // We don't know what's going on with the callee at all. Don't touch it.
    if (!CalleeMF)
      return UnknownCallOutlineType;

    // Check if we know anything about the callee saves on the function. If we
    // don't, then don't touch it, since that implies that we haven't
    // computed anything about its stack frame yet.
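    // (A callee with a non-zero stack size or stack objects may expect data
    // at fixed SP offsets, which saving LR to the stack in the outlined
    // function could disturb.)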
    MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
    if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
        MFI.getNumObjects() > 0)
      return UnknownCallOutlineType;

    // At this point, we can say that CalleeMF ought to not pass anything on
    // the stack. Therefore, we can outline it.
    return outliner::InstrType::Legal;
  }

  // Don't outline positions.
  if (MI.isPosition())
    return outliner::InstrType::Illegal;

  // Don't touch the link register or W30.
  if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
      MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
    return outliner::InstrType::Illegal;

  // Don't outline BTI instructions, because that will prevent the outlining
  // site from being indirectly callable.
  if (MI.getOpcode() == AArch64::HINT) {
    int64_t Imm = MI.getOperand(0).getImm();
    if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
      return outliner::InstrType::Illegal;
  }

  return outliner::InstrType::Legal;
}

void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
  for (MachineInstr &MI : MBB) {
    const MachineOperand *Base;
    unsigned Width;
    int64_t Offset;

    // Is this a load or store with an immediate offset with SP as the base?
    if (!MI.mayLoadOrStore() ||
        !getMemOperandWithOffsetWidth(MI, Base, Offset, Width, &RI) ||
        (Base->isReg() && Base->getReg() != AArch64::SP))
      continue;

    // It is, so we have to fix it up.
    unsigned Scale;
    int64_t Dummy1, Dummy2;

    MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
    assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
    getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
    assert(Scale != 0 && "Unexpected opcode!");

    // We've pushed the return address to the stack, so add 16 to the offset.
    // This is safe, since we already checked if it would overflow when we
    // checked if this instruction was legal to outline.
    int64_t NewImm = (Offset + 16) / Scale;
    StackOffsetOperand.setImm(NewImm);
  }
}

void AArch64InstrInfo::buildOutlinedFrame(
    MachineBasicBlock &MBB, MachineFunction &MF,
    const outliner::OutlinedFunction &OF) const {
  // For thunk outlining, rewrite the last instruction from a call to a
  // tail-call.
  if (OF.FrameConstructionID == MachineOutlinerThunk) {
    MachineInstr *Call = &*--MBB.instr_end();
    unsigned TailOpcode;
    if (Call->getOpcode() == AArch64::BL) {
      TailOpcode = AArch64::TCRETURNdi;
    } else {
      assert(Call->getOpcode() == AArch64::BLR);
      TailOpcode = AArch64::TCRETURNriALL;
    }
    MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
                           .add(Call->getOperand(0))
                           .addImm(0);
    MBB.insert(MBB.end(), TC);
    Call->eraseFromParent();
  }

  // Is there a call in the outlined range?
  auto IsNonTailCall = [](MachineInstr &MI) {
    return MI.isCall() && !MI.isReturn();
  };
  if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) {
    // Fix up the instructions in the range, since we're going to modify the
    // stack.
    assert(OF.FrameConstructionID != MachineOutlinerDefault &&
           "Can only fix up stack references once");
    fixupPostOutline(MBB);

    // LR has to be a live in so that we can save it.
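    // (The STRXpre/LDRXpost pair inserted below spills LR on entry to the
    // outlined function and reloads it before the return or tail call.)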
    MBB.addLiveIn(AArch64::LR);

    MachineBasicBlock::iterator It = MBB.begin();
    MachineBasicBlock::iterator Et = MBB.end();

    if (OF.FrameConstructionID == MachineOutlinerTailCall ||
        OF.FrameConstructionID == MachineOutlinerThunk)
      Et = std::prev(MBB.end());

    // Insert a save before the outlined region
    MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
                                .addReg(AArch64::SP, RegState::Define)
                                .addReg(AArch64::LR)
                                .addReg(AArch64::SP)
                                .addImm(-16);
    It = MBB.insert(It, STRXpre);

    const TargetSubtargetInfo &STI = MF.getSubtarget();
    const MCRegisterInfo *MRI = STI.getRegisterInfo();
    unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);

    // Add a CFI saying the stack was moved 16 B down.
    int64_t StackPosEntry =
        MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 16));
    BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
        .addCFIIndex(StackPosEntry)
        .setMIFlags(MachineInstr::FrameSetup);

    // Add a CFI saying that the LR that we want to find is now 16 B higher
    // than before.
    int64_t LRPosEntry =
        MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, 16));
    BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
        .addCFIIndex(LRPosEntry)
        .setMIFlags(MachineInstr::FrameSetup);

    // Insert a restore before the terminator for the function.
    MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
                                 .addReg(AArch64::SP, RegState::Define)
                                 .addReg(AArch64::LR, RegState::Define)
                                 .addReg(AArch64::SP)
                                 .addImm(16);
    Et = MBB.insert(Et, LDRXpost);
  }

  // If this is a tail call outlined function, then there's already a return.
  if (OF.FrameConstructionID == MachineOutlinerTailCall ||
      OF.FrameConstructionID == MachineOutlinerThunk)
    return;

  // It's not a tail call, so we have to insert the return ourselves.
  MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
                          .addReg(AArch64::LR, RegState::Undef);
  MBB.insert(MBB.end(), ret);

  // Did we have to modify the stack by saving the link register?
  if (OF.FrameConstructionID != MachineOutlinerDefault)
    return;

  // We modified the stack.
  // Walk over the basic block and fix up all the stack accesses.
  fixupPostOutline(MBB);
}

MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
    Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
    MachineFunction &MF, const outliner::Candidate &C) const {

  // Are we tail calling?
  if (C.CallConstructionID == MachineOutlinerTailCall) {
    // If yes, then we can just branch to the label.
    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
                            .addGlobalAddress(M.getNamedValue(MF.getName()))
                            .addImm(0));
    return It;
  }

  // Are we saving the link register?
  if (C.CallConstructionID == MachineOutlinerNoLRSave ||
      C.CallConstructionID == MachineOutlinerThunk) {
    // No, so just insert the call.
    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
                            .addGlobalAddress(M.getNamedValue(MF.getName())));
    return It;
  }

  // We want to return the spot where we inserted the call.
  MachineBasicBlock::iterator CallPt;

  // Instructions for saving and restoring LR around the call instruction we're
  // going to insert.
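  // As a rough sketch (with xN standing in for whatever scratch register
  // findRegisterToSaveLRTo picks), the RegSave call sequence emitted below is
  //   mov xN, lr            ; ORRXrs xN, xzr, lr, #0
  //   bl  OUTLINED_FUNCTION
  //   mov lr, xN
  // while the default case spills LR around the call instead:
  //   str lr, [sp, #-16]!
  //   bl  OUTLINED_FUNCTION
  //   ldr lr, [sp], #16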
  MachineInstr *Save;
  MachineInstr *Restore;
  // Can we save to a register?
  if (C.CallConstructionID == MachineOutlinerRegSave) {
    // FIXME: This logic should be sunk into a target-specific interface so
    // that we don't have to recompute the register.
    unsigned Reg = findRegisterToSaveLRTo(C);
    assert(Reg != 0 && "No callee-saved register available?");

    // Save and restore LR from that register.
    Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
               .addReg(AArch64::XZR)
               .addReg(AArch64::LR)
               .addImm(0);
    Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
                  .addReg(AArch64::XZR)
                  .addReg(Reg)
                  .addImm(0);
  } else {
    // We have the default case. Save and restore from SP.
    Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
               .addReg(AArch64::SP, RegState::Define)
               .addReg(AArch64::LR)
               .addReg(AArch64::SP)
               .addImm(-16);
    Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
                  .addReg(AArch64::SP, RegState::Define)
                  .addReg(AArch64::LR, RegState::Define)
                  .addReg(AArch64::SP)
                  .addImm(16);
  }

  It = MBB.insert(It, Save);
  It++;

  // Insert the call.
  It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
                          .addGlobalAddress(M.getNamedValue(MF.getName())));
  CallPt = It;
  It++;

  It = MBB.insert(It, Restore);
  return CallPt;
}

bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
    MachineFunction &MF) const {
  return MF.getFunction().hasMinSize();
}

bool AArch64InstrInfo::isCopyInstrImpl(
    const MachineInstr &MI, const MachineOperand *&Source,
    const MachineOperand *&Destination) const {

  // AArch64::ORRWrs and AArch64::ORRXrs with a WZR/XZR source register and a
  // zero immediate operand are used as an alias for the mov instruction.
  if (MI.getOpcode() == AArch64::ORRWrs &&
      MI.getOperand(1).getReg() == AArch64::WZR &&
      MI.getOperand(3).getImm() == 0x0) {
    Destination = &MI.getOperand(0);
    Source = &MI.getOperand(2);
    return true;
  }

  if (MI.getOpcode() == AArch64::ORRXrs &&
      MI.getOperand(1).getReg() == AArch64::XZR &&
      MI.getOperand(3).getImm() == 0x0) {
    Destination = &MI.getOperand(0);
    Source = &MI.getOperand(2);
    return true;
  }

  return false;
}

#define GET_INSTRINFO_HELPERS
#include "AArch64GenInstrInfo.inc"