1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains the AArch64 implementation of the TargetInstrInfo class. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "AArch64InstrInfo.h" 14 #include "AArch64MachineFunctionInfo.h" 15 #include "AArch64Subtarget.h" 16 #include "MCTargetDesc/AArch64AddressingModes.h" 17 #include "Utils/AArch64BaseInfo.h" 18 #include "llvm/ADT/ArrayRef.h" 19 #include "llvm/ADT/STLExtras.h" 20 #include "llvm/ADT/SmallVector.h" 21 #include "llvm/CodeGen/MachineBasicBlock.h" 22 #include "llvm/CodeGen/MachineFrameInfo.h" 23 #include "llvm/CodeGen/MachineFunction.h" 24 #include "llvm/CodeGen/MachineInstr.h" 25 #include "llvm/CodeGen/MachineInstrBuilder.h" 26 #include "llvm/CodeGen/MachineMemOperand.h" 27 #include "llvm/CodeGen/MachineOperand.h" 28 #include "llvm/CodeGen/MachineRegisterInfo.h" 29 #include "llvm/CodeGen/MachineModuleInfo.h" 30 #include "llvm/CodeGen/StackMaps.h" 31 #include "llvm/CodeGen/TargetRegisterInfo.h" 32 #include "llvm/CodeGen/TargetSubtargetInfo.h" 33 #include "llvm/IR/DebugInfoMetadata.h" 34 #include "llvm/IR/DebugLoc.h" 35 #include "llvm/IR/GlobalValue.h" 36 #include "llvm/MC/MCAsmInfo.h" 37 #include "llvm/MC/MCInst.h" 38 #include "llvm/MC/MCInstrDesc.h" 39 #include "llvm/Support/Casting.h" 40 #include "llvm/Support/CodeGen.h" 41 #include "llvm/Support/CommandLine.h" 42 #include "llvm/Support/Compiler.h" 43 #include "llvm/Support/ErrorHandling.h" 44 #include "llvm/Support/MathExtras.h" 45 #include "llvm/Target/TargetMachine.h" 46 #include "llvm/Target/TargetOptions.h" 47 #include <cassert> 48 #include <cstdint> 49 #include <iterator> 50 #include <utility> 51 52 using namespace llvm; 53 54 #define GET_INSTRINFO_CTOR_DTOR 55 #include "AArch64GenInstrInfo.inc" 56 57 static cl::opt<unsigned> TBZDisplacementBits( 58 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), 59 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); 60 61 static cl::opt<unsigned> CBZDisplacementBits( 62 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), 63 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)")); 64 65 static cl::opt<unsigned> 66 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), 67 cl::desc("Restrict range of Bcc instructions (DEBUG)")); 68 69 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) 70 : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP, 71 AArch64::CATCHRET), 72 RI(STI.getTargetTriple()), Subtarget(STI) {} 73 74 /// GetInstSize - Return the number of bytes of code the specified 75 /// instruction may be. This returns the maximum number of bytes. 76 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { 77 const MachineBasicBlock &MBB = *MI.getParent(); 78 const MachineFunction *MF = MBB.getParent(); 79 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); 80 81 { 82 auto Op = MI.getOpcode(); 83 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR) 84 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); 85 } 86 87 // Meta-instructions emit no code. 
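  // (e.g. DBG_VALUE, IMPLICIT_DEF, KILL and CFI_INSTRUCTION all fall into this
  // category and therefore contribute 0 bytes here.)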
88 if (MI.isMetaInstruction()) 89 return 0; 90 91 // FIXME: We currently only handle pseudoinstructions that don't get expanded 92 // before the assembly printer. 93 unsigned NumBytes = 0; 94 const MCInstrDesc &Desc = MI.getDesc(); 95 switch (Desc.getOpcode()) { 96 default: 97 // Anything not explicitly designated otherwise is a normal 4-byte insn. 98 NumBytes = 4; 99 break; 100 case TargetOpcode::STACKMAP: 101 // The upper bound for a stackmap intrinsic is the full length of its shadow 102 NumBytes = StackMapOpers(&MI).getNumPatchBytes(); 103 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 104 break; 105 case TargetOpcode::PATCHPOINT: 106 // The size of the patchpoint intrinsic is the number of bytes requested 107 NumBytes = PatchPointOpers(&MI).getNumPatchBytes(); 108 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 109 break; 110 case AArch64::TLSDESC_CALLSEQ: 111 // This gets lowered to an instruction sequence which takes 16 bytes 112 NumBytes = 16; 113 break; 114 case AArch64::JumpTableDest32: 115 case AArch64::JumpTableDest16: 116 case AArch64::JumpTableDest8: 117 NumBytes = 12; 118 break; 119 case AArch64::SPACE: 120 NumBytes = MI.getOperand(1).getImm(); 121 break; 122 } 123 124 return NumBytes; 125 } 126 127 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, 128 SmallVectorImpl<MachineOperand> &Cond) { 129 // Block ends with fall-through condbranch. 130 switch (LastInst->getOpcode()) { 131 default: 132 llvm_unreachable("Unknown branch instruction?"); 133 case AArch64::Bcc: 134 Target = LastInst->getOperand(1).getMBB(); 135 Cond.push_back(LastInst->getOperand(0)); 136 break; 137 case AArch64::CBZW: 138 case AArch64::CBZX: 139 case AArch64::CBNZW: 140 case AArch64::CBNZX: 141 Target = LastInst->getOperand(1).getMBB(); 142 Cond.push_back(MachineOperand::CreateImm(-1)); 143 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); 144 Cond.push_back(LastInst->getOperand(0)); 145 break; 146 case AArch64::TBZW: 147 case AArch64::TBZX: 148 case AArch64::TBNZW: 149 case AArch64::TBNZX: 150 Target = LastInst->getOperand(2).getMBB(); 151 Cond.push_back(MachineOperand::CreateImm(-1)); 152 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); 153 Cond.push_back(LastInst->getOperand(0)); 154 Cond.push_back(LastInst->getOperand(1)); 155 } 156 } 157 158 static unsigned getBranchDisplacementBits(unsigned Opc) { 159 switch (Opc) { 160 default: 161 llvm_unreachable("unexpected opcode!"); 162 case AArch64::B: 163 return 64; 164 case AArch64::TBNZW: 165 case AArch64::TBZW: 166 case AArch64::TBNZX: 167 case AArch64::TBZX: 168 return TBZDisplacementBits; 169 case AArch64::CBNZW: 170 case AArch64::CBZW: 171 case AArch64::CBNZX: 172 case AArch64::CBZX: 173 return CBZDisplacementBits; 174 case AArch64::Bcc: 175 return BCCDisplacementBits; 176 } 177 } 178 179 bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp, 180 int64_t BrOffset) const { 181 unsigned Bits = getBranchDisplacementBits(BranchOp); 182 assert(Bits >= 3 && "max branch displacement must be enough to jump" 183 "over conditional branch expansion"); 184 return isIntN(Bits, BrOffset / 4); 185 } 186 187 MachineBasicBlock * 188 AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const { 189 switch (MI.getOpcode()) { 190 default: 191 llvm_unreachable("unexpected opcode!"); 192 case AArch64::B: 193 return MI.getOperand(0).getMBB(); 194 case AArch64::TBZW: 195 case AArch64::TBNZW: 196 case AArch64::TBZX: 197 case AArch64::TBNZX: 198 return 
        MI.getOperand(2).getMBB();
  case AArch64::CBZW:
  case AArch64::CBNZW:
  case AArch64::CBZX:
  case AArch64::CBNZX:
  case AArch64::Bcc:
    return MI.getOperand(1).getMBB();
  }
}

// Branch analysis.
bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                     MachineBasicBlock *&TBB,
                                     MachineBasicBlock *&FBB,
                                     SmallVectorImpl<MachineOperand> &Cond,
                                     bool AllowModify) const {
  // If the block has no terminators, it just falls into the block after it.
  MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
  if (I == MBB.end())
    return false;

  if (!isUnpredicatedTerminator(*I))
    return false;

  // Get the last instruction in the block.
  MachineInstr *LastInst = &*I;

  // If there is only one terminator instruction, process it.
  unsigned LastOpc = LastInst->getOpcode();
  if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
    if (isUncondBranchOpcode(LastOpc)) {
      TBB = LastInst->getOperand(0).getMBB();
      return false;
    }
    if (isCondBranchOpcode(LastOpc)) {
      // Block ends with fall-through condbranch.
      parseCondBranch(LastInst, TBB, Cond);
      return false;
    }
    return true; // Can't handle indirect branch.
  }

  // Get the instruction before it if it is a terminator.
  MachineInstr *SecondLastInst = &*I;
  unsigned SecondLastOpc = SecondLastInst->getOpcode();

  // If AllowModify is true and the block ends with two or more unconditional
  // branches, delete all but the first unconditional branch.
  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
    while (isUncondBranchOpcode(SecondLastOpc)) {
      LastInst->eraseFromParent();
      LastInst = SecondLastInst;
      LastOpc = LastInst->getOpcode();
      if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
        // Return now; the only remaining terminator is an unconditional branch.
        TBB = LastInst->getOperand(0).getMBB();
        return false;
      } else {
        SecondLastInst = &*I;
        SecondLastOpc = SecondLastInst->getOpcode();
      }
    }
  }

  // If there are three terminators, we don't know what sort of block this is.
  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
    return true;

  // If the block ends with a B and a Bcc, handle it.
  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    parseCondBranch(SecondLastInst, TBB, Cond);
    FBB = LastInst->getOperand(0).getMBB();
    return false;
  }

  // If the block ends with two unconditional branches, handle it. The second
  // one is not executed, so remove it.
  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    TBB = SecondLastInst->getOperand(0).getMBB();
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return false;
  }

  // ...likewise if it ends with an indirect branch followed by an unconditional
  // branch.
  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
    I = LastInst;
    if (AllowModify)
      I->eraseFromParent();
    return true;
  }

  // Otherwise, can't handle this.
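  // (A 'true' return value tells callers that this block's terminators could
  // not be analyzed, so no branch-folding transformations are applied to it.)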
293 return true; 294 } 295 296 bool AArch64InstrInfo::reverseBranchCondition( 297 SmallVectorImpl<MachineOperand> &Cond) const { 298 if (Cond[0].getImm() != -1) { 299 // Regular Bcc 300 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm(); 301 Cond[0].setImm(AArch64CC::getInvertedCondCode(CC)); 302 } else { 303 // Folded compare-and-branch 304 switch (Cond[1].getImm()) { 305 default: 306 llvm_unreachable("Unknown conditional branch!"); 307 case AArch64::CBZW: 308 Cond[1].setImm(AArch64::CBNZW); 309 break; 310 case AArch64::CBNZW: 311 Cond[1].setImm(AArch64::CBZW); 312 break; 313 case AArch64::CBZX: 314 Cond[1].setImm(AArch64::CBNZX); 315 break; 316 case AArch64::CBNZX: 317 Cond[1].setImm(AArch64::CBZX); 318 break; 319 case AArch64::TBZW: 320 Cond[1].setImm(AArch64::TBNZW); 321 break; 322 case AArch64::TBNZW: 323 Cond[1].setImm(AArch64::TBZW); 324 break; 325 case AArch64::TBZX: 326 Cond[1].setImm(AArch64::TBNZX); 327 break; 328 case AArch64::TBNZX: 329 Cond[1].setImm(AArch64::TBZX); 330 break; 331 } 332 } 333 334 return false; 335 } 336 337 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB, 338 int *BytesRemoved) const { 339 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); 340 if (I == MBB.end()) 341 return 0; 342 343 if (!isUncondBranchOpcode(I->getOpcode()) && 344 !isCondBranchOpcode(I->getOpcode())) 345 return 0; 346 347 // Remove the branch. 348 I->eraseFromParent(); 349 350 I = MBB.end(); 351 352 if (I == MBB.begin()) { 353 if (BytesRemoved) 354 *BytesRemoved = 4; 355 return 1; 356 } 357 --I; 358 if (!isCondBranchOpcode(I->getOpcode())) { 359 if (BytesRemoved) 360 *BytesRemoved = 4; 361 return 1; 362 } 363 364 // Remove the branch. 365 I->eraseFromParent(); 366 if (BytesRemoved) 367 *BytesRemoved = 8; 368 369 return 2; 370 } 371 372 void AArch64InstrInfo::instantiateCondBranch( 373 MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB, 374 ArrayRef<MachineOperand> Cond) const { 375 if (Cond[0].getImm() != -1) { 376 // Regular Bcc 377 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB); 378 } else { 379 // Folded compare-and-branch 380 // Note that we use addOperand instead of addReg to keep the flags. 381 const MachineInstrBuilder MIB = 382 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]); 383 if (Cond.size() > 3) 384 MIB.addImm(Cond[3].getImm()); 385 MIB.addMBB(TBB); 386 } 387 } 388 389 unsigned AArch64InstrInfo::insertBranch( 390 MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, 391 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const { 392 // Shouldn't be a fall through. 393 assert(TBB && "insertBranch must not be told to insert a fallthrough"); 394 395 if (!FBB) { 396 if (Cond.empty()) // Unconditional branch? 397 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB); 398 else 399 instantiateCondBranch(MBB, DL, TBB, Cond); 400 401 if (BytesAdded) 402 *BytesAdded = 4; 403 404 return 1; 405 } 406 407 // Two-way conditional branch. 408 instantiateCondBranch(MBB, DL, TBB, Cond); 409 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB); 410 411 if (BytesAdded) 412 *BytesAdded = 8; 413 414 return 2; 415 } 416 417 // Find the original register that VReg is copied from. 
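// For example, with
//   %1 = COPY %0
//   %2 = COPY %1
// removeCopies(MRI, %2) walks back through the full copies and returns %0,
// stopping at the first definition that is not a full copy (or at a physical
// register).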
418 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { 419 while (Register::isVirtualRegister(VReg)) { 420 const MachineInstr *DefMI = MRI.getVRegDef(VReg); 421 if (!DefMI->isFullCopy()) 422 return VReg; 423 VReg = DefMI->getOperand(1).getReg(); 424 } 425 return VReg; 426 } 427 428 // Determine if VReg is defined by an instruction that can be folded into a 429 // csel instruction. If so, return the folded opcode, and the replacement 430 // register. 431 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, 432 unsigned *NewVReg = nullptr) { 433 VReg = removeCopies(MRI, VReg); 434 if (!Register::isVirtualRegister(VReg)) 435 return 0; 436 437 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg)); 438 const MachineInstr *DefMI = MRI.getVRegDef(VReg); 439 unsigned Opc = 0; 440 unsigned SrcOpNum = 0; 441 switch (DefMI->getOpcode()) { 442 case AArch64::ADDSXri: 443 case AArch64::ADDSWri: 444 // if NZCV is used, do not fold. 445 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 446 return 0; 447 // fall-through to ADDXri and ADDWri. 448 LLVM_FALLTHROUGH; 449 case AArch64::ADDXri: 450 case AArch64::ADDWri: 451 // add x, 1 -> csinc. 452 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 || 453 DefMI->getOperand(3).getImm() != 0) 454 return 0; 455 SrcOpNum = 1; 456 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr; 457 break; 458 459 case AArch64::ORNXrr: 460 case AArch64::ORNWrr: { 461 // not x -> csinv, represented as orn dst, xzr, src. 462 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); 463 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) 464 return 0; 465 SrcOpNum = 2; 466 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr; 467 break; 468 } 469 470 case AArch64::SUBSXrr: 471 case AArch64::SUBSWrr: 472 // if NZCV is used, do not fold. 473 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 474 return 0; 475 // fall-through to SUBXrr and SUBWrr. 476 LLVM_FALLTHROUGH; 477 case AArch64::SUBXrr: 478 case AArch64::SUBWrr: { 479 // neg x -> csneg, represented as sub dst, xzr, src. 480 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); 481 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) 482 return 0; 483 SrcOpNum = 2; 484 Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr; 485 break; 486 } 487 default: 488 return 0; 489 } 490 assert(Opc && SrcOpNum && "Missing parameters"); 491 492 if (NewVReg) 493 *NewVReg = DefMI->getOperand(SrcOpNum).getReg(); 494 return Opc; 495 } 496 497 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 498 ArrayRef<MachineOperand> Cond, 499 unsigned TrueReg, unsigned FalseReg, 500 int &CondCycles, int &TrueCycles, 501 int &FalseCycles) const { 502 // Check register classes. 503 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 504 const TargetRegisterClass *RC = 505 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); 506 if (!RC) 507 return false; 508 509 // Expanding cbz/tbz requires an extra cycle of latency on the condition. 510 unsigned ExtraCondLat = Cond.size() != 1; 511 512 // GPRs are handled by csel. 513 // FIXME: Fold in x+1, -x, and ~x when applicable. 514 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) || 515 AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 516 // Single-cycle csel, csinc, csinv, and csneg. 
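    // As a reminder of the semantics being relied on here:
    //   csel  Rd, Rn, Rm, cc : Rd = cc ? Rn : Rm
    //   csinc Rd, Rn, Rm, cc : Rd = cc ? Rn : Rm + 1
    //   csinv Rd, Rn, Rm, cc : Rd = cc ? Rn : ~Rm
    //   csneg Rd, Rn, Rm, cc : Rd = cc ? Rn : -Rm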
517 CondCycles = 1 + ExtraCondLat; 518 TrueCycles = FalseCycles = 1; 519 if (canFoldIntoCSel(MRI, TrueReg)) 520 TrueCycles = 0; 521 else if (canFoldIntoCSel(MRI, FalseReg)) 522 FalseCycles = 0; 523 return true; 524 } 525 526 // Scalar floating point is handled by fcsel. 527 // FIXME: Form fabs, fmin, and fmax when applicable. 528 if (AArch64::FPR64RegClass.hasSubClassEq(RC) || 529 AArch64::FPR32RegClass.hasSubClassEq(RC)) { 530 CondCycles = 5 + ExtraCondLat; 531 TrueCycles = FalseCycles = 2; 532 return true; 533 } 534 535 // Can't do vectors. 536 return false; 537 } 538 539 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, 540 MachineBasicBlock::iterator I, 541 const DebugLoc &DL, unsigned DstReg, 542 ArrayRef<MachineOperand> Cond, 543 unsigned TrueReg, unsigned FalseReg) const { 544 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 545 546 // Parse the condition code, see parseCondBranch() above. 547 AArch64CC::CondCode CC; 548 switch (Cond.size()) { 549 default: 550 llvm_unreachable("Unknown condition opcode in Cond"); 551 case 1: // b.cc 552 CC = AArch64CC::CondCode(Cond[0].getImm()); 553 break; 554 case 3: { // cbz/cbnz 555 // We must insert a compare against 0. 556 bool Is64Bit; 557 switch (Cond[1].getImm()) { 558 default: 559 llvm_unreachable("Unknown branch opcode in Cond"); 560 case AArch64::CBZW: 561 Is64Bit = false; 562 CC = AArch64CC::EQ; 563 break; 564 case AArch64::CBZX: 565 Is64Bit = true; 566 CC = AArch64CC::EQ; 567 break; 568 case AArch64::CBNZW: 569 Is64Bit = false; 570 CC = AArch64CC::NE; 571 break; 572 case AArch64::CBNZX: 573 Is64Bit = true; 574 CC = AArch64CC::NE; 575 break; 576 } 577 Register SrcReg = Cond[2].getReg(); 578 if (Is64Bit) { 579 // cmp reg, #0 is actually subs xzr, reg, #0. 580 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass); 581 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR) 582 .addReg(SrcReg) 583 .addImm(0) 584 .addImm(0); 585 } else { 586 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass); 587 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR) 588 .addReg(SrcReg) 589 .addImm(0) 590 .addImm(0); 591 } 592 break; 593 } 594 case 4: { // tbz/tbnz 595 // We must insert a tst instruction. 596 switch (Cond[1].getImm()) { 597 default: 598 llvm_unreachable("Unknown branch opcode in Cond"); 599 case AArch64::TBZW: 600 case AArch64::TBZX: 601 CC = AArch64CC::EQ; 602 break; 603 case AArch64::TBNZW: 604 case AArch64::TBNZX: 605 CC = AArch64CC::NE; 606 break; 607 } 608 // cmp reg, #foo is actually ands xzr, reg, #1<<foo. 
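    // For example, a TBNZ of bit 3 in w0 is materialized here as
    //   ands wzr, w0, #0x8
    // and the select below then uses the NE condition.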
    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
      BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
    else
      BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
          .addReg(Cond[2].getReg())
          .addImm(
              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
    break;
  }
  }

  unsigned Opc = 0;
  const TargetRegisterClass *RC = nullptr;
  bool TryFold = false;
  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
    RC = &AArch64::GPR64RegClass;
    Opc = AArch64::CSELXr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
    RC = &AArch64::GPR32RegClass;
    Opc = AArch64::CSELWr;
    TryFold = true;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
    RC = &AArch64::FPR64RegClass;
    Opc = AArch64::FCSELDrrr;
  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
    RC = &AArch64::FPR32RegClass;
    Opc = AArch64::FCSELSrrr;
  }
  assert(RC && "Unsupported regclass");

  // Try folding simple instructions into the csel.
  if (TryFold) {
    unsigned NewVReg = 0;
    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
    if (FoldedOpc) {
      // The folded opcodes csinc, csinv and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
      CC = AArch64CC::getInvertedCondCode(CC);
      TrueReg = FalseReg;
    } else
      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);

    // Fold the operation. Leave any dead instructions for DCE to clean up.
    if (FoldedOpc) {
      FalseReg = NewVReg;
      Opc = FoldedOpc;
      // This extends the live range of NewVReg.
      MRI.clearKillFlags(NewVReg);
    }
  }

  // Pull all virtual registers into the appropriate class.
  MRI.constrainRegClass(TrueReg, RC);
  MRI.constrainRegClass(FalseReg, RC);

  // Insert the csel.
  BuildMI(MBB, I, DL, get(Opc), DstReg)
      .addReg(TrueReg)
      .addReg(FalseReg)
      .addImm(CC);
}

/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
  uint64_t Imm = MI.getOperand(1).getImm();
  uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
  uint64_t Encoding;
  return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding);
}

// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in the future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
  if (!Subtarget.hasCustomCheapAsMoveHandling())
    return MI.isAsCheapAsAMove();

  const unsigned Opcode = MI.getOpcode();

  // Firstly, check cases gated by features.

  if (Subtarget.hasZeroCycleZeroingFP()) {
    if (Opcode == AArch64::FMOVH0 ||
        Opcode == AArch64::FMOVS0 ||
        Opcode == AArch64::FMOVD0)
      return true;
  }

  if (Subtarget.hasZeroCycleZeroingGP()) {
    if (Opcode == TargetOpcode::COPY &&
        (MI.getOperand(1).getReg() == AArch64::WZR ||
         MI.getOperand(1).getReg() == AArch64::XZR))
      return true;
  }

  // Secondly, check cases specific to sub-targets.
708 709 if (Subtarget.hasExynosCheapAsMoveHandling()) { 710 if (isExynosCheapAsMove(MI)) 711 return true; 712 713 return MI.isAsCheapAsAMove(); 714 } 715 716 // Finally, check generic cases. 717 718 switch (Opcode) { 719 default: 720 return false; 721 722 // add/sub on register without shift 723 case AArch64::ADDWri: 724 case AArch64::ADDXri: 725 case AArch64::SUBWri: 726 case AArch64::SUBXri: 727 return (MI.getOperand(3).getImm() == 0); 728 729 // logical ops on immediate 730 case AArch64::ANDWri: 731 case AArch64::ANDXri: 732 case AArch64::EORWri: 733 case AArch64::EORXri: 734 case AArch64::ORRWri: 735 case AArch64::ORRXri: 736 return true; 737 738 // logical ops on register without shift 739 case AArch64::ANDWrr: 740 case AArch64::ANDXrr: 741 case AArch64::BICWrr: 742 case AArch64::BICXrr: 743 case AArch64::EONWrr: 744 case AArch64::EONXrr: 745 case AArch64::EORWrr: 746 case AArch64::EORXrr: 747 case AArch64::ORNWrr: 748 case AArch64::ORNXrr: 749 case AArch64::ORRWrr: 750 case AArch64::ORRXrr: 751 return true; 752 753 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or 754 // ORRXri, it is as cheap as MOV 755 case AArch64::MOVi32imm: 756 return canBeExpandedToORR(MI, 32); 757 case AArch64::MOVi64imm: 758 return canBeExpandedToORR(MI, 64); 759 } 760 761 llvm_unreachable("Unknown opcode to check as cheap as a move!"); 762 } 763 764 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) { 765 switch (MI.getOpcode()) { 766 default: 767 return false; 768 769 case AArch64::ADDWrs: 770 case AArch64::ADDXrs: 771 case AArch64::ADDSWrs: 772 case AArch64::ADDSXrs: { 773 unsigned Imm = MI.getOperand(3).getImm(); 774 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 775 if (ShiftVal == 0) 776 return true; 777 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5; 778 } 779 780 case AArch64::ADDWrx: 781 case AArch64::ADDXrx: 782 case AArch64::ADDXrx64: 783 case AArch64::ADDSWrx: 784 case AArch64::ADDSXrx: 785 case AArch64::ADDSXrx64: { 786 unsigned Imm = MI.getOperand(3).getImm(); 787 switch (AArch64_AM::getArithExtendType(Imm)) { 788 default: 789 return false; 790 case AArch64_AM::UXTB: 791 case AArch64_AM::UXTH: 792 case AArch64_AM::UXTW: 793 case AArch64_AM::UXTX: 794 return AArch64_AM::getArithShiftValue(Imm) <= 4; 795 } 796 } 797 798 case AArch64::SUBWrs: 799 case AArch64::SUBSWrs: { 800 unsigned Imm = MI.getOperand(3).getImm(); 801 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 802 return ShiftVal == 0 || 803 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31); 804 } 805 806 case AArch64::SUBXrs: 807 case AArch64::SUBSXrs: { 808 unsigned Imm = MI.getOperand(3).getImm(); 809 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 810 return ShiftVal == 0 || 811 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63); 812 } 813 814 case AArch64::SUBWrx: 815 case AArch64::SUBXrx: 816 case AArch64::SUBXrx64: 817 case AArch64::SUBSWrx: 818 case AArch64::SUBSXrx: 819 case AArch64::SUBSXrx64: { 820 unsigned Imm = MI.getOperand(3).getImm(); 821 switch (AArch64_AM::getArithExtendType(Imm)) { 822 default: 823 return false; 824 case AArch64_AM::UXTB: 825 case AArch64_AM::UXTH: 826 case AArch64_AM::UXTW: 827 case AArch64_AM::UXTX: 828 return AArch64_AM::getArithShiftValue(Imm) == 0; 829 } 830 } 831 832 case AArch64::LDRBBroW: 833 case AArch64::LDRBBroX: 834 case AArch64::LDRBroW: 835 case AArch64::LDRBroX: 836 case AArch64::LDRDroW: 837 case AArch64::LDRDroX: 838 case AArch64::LDRHHroW: 839 case AArch64::LDRHHroX: 840 case 
AArch64::LDRHroW: 841 case AArch64::LDRHroX: 842 case AArch64::LDRQroW: 843 case AArch64::LDRQroX: 844 case AArch64::LDRSBWroW: 845 case AArch64::LDRSBWroX: 846 case AArch64::LDRSBXroW: 847 case AArch64::LDRSBXroX: 848 case AArch64::LDRSHWroW: 849 case AArch64::LDRSHWroX: 850 case AArch64::LDRSHXroW: 851 case AArch64::LDRSHXroX: 852 case AArch64::LDRSWroW: 853 case AArch64::LDRSWroX: 854 case AArch64::LDRSroW: 855 case AArch64::LDRSroX: 856 case AArch64::LDRWroW: 857 case AArch64::LDRWroX: 858 case AArch64::LDRXroW: 859 case AArch64::LDRXroX: 860 case AArch64::PRFMroW: 861 case AArch64::PRFMroX: 862 case AArch64::STRBBroW: 863 case AArch64::STRBBroX: 864 case AArch64::STRBroW: 865 case AArch64::STRBroX: 866 case AArch64::STRDroW: 867 case AArch64::STRDroX: 868 case AArch64::STRHHroW: 869 case AArch64::STRHHroX: 870 case AArch64::STRHroW: 871 case AArch64::STRHroX: 872 case AArch64::STRQroW: 873 case AArch64::STRQroX: 874 case AArch64::STRSroW: 875 case AArch64::STRSroX: 876 case AArch64::STRWroW: 877 case AArch64::STRWroX: 878 case AArch64::STRXroW: 879 case AArch64::STRXroX: { 880 unsigned IsSigned = MI.getOperand(3).getImm(); 881 return !IsSigned; 882 } 883 } 884 } 885 886 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) { 887 unsigned Opc = MI.getOpcode(); 888 switch (Opc) { 889 default: 890 return false; 891 case AArch64::SEH_StackAlloc: 892 case AArch64::SEH_SaveFPLR: 893 case AArch64::SEH_SaveFPLR_X: 894 case AArch64::SEH_SaveReg: 895 case AArch64::SEH_SaveReg_X: 896 case AArch64::SEH_SaveRegP: 897 case AArch64::SEH_SaveRegP_X: 898 case AArch64::SEH_SaveFReg: 899 case AArch64::SEH_SaveFReg_X: 900 case AArch64::SEH_SaveFRegP: 901 case AArch64::SEH_SaveFRegP_X: 902 case AArch64::SEH_SetFP: 903 case AArch64::SEH_AddFP: 904 case AArch64::SEH_Nop: 905 case AArch64::SEH_PrologEnd: 906 case AArch64::SEH_EpilogStart: 907 case AArch64::SEH_EpilogEnd: 908 return true; 909 } 910 } 911 912 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, 913 unsigned &SrcReg, unsigned &DstReg, 914 unsigned &SubIdx) const { 915 switch (MI.getOpcode()) { 916 default: 917 return false; 918 case AArch64::SBFMXri: // aka sxtw 919 case AArch64::UBFMXri: // aka uxtw 920 // Check for the 32 -> 64 bit extension case, these instructions can do 921 // much more. 922 if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31) 923 return false; 924 // This is a signed or unsigned 32 -> 64 bit extension. 925 SrcReg = MI.getOperand(1).getReg(); 926 DstReg = MI.getOperand(0).getReg(); 927 SubIdx = AArch64::sub_32; 928 return true; 929 } 930 } 931 932 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint( 933 const MachineInstr &MIa, const MachineInstr &MIb) const { 934 const TargetRegisterInfo *TRI = &getRegisterInfo(); 935 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr; 936 int64_t OffsetA = 0, OffsetB = 0; 937 unsigned WidthA = 0, WidthB = 0; 938 939 assert(MIa.mayLoadOrStore() && "MIa must be a load or store."); 940 assert(MIb.mayLoadOrStore() && "MIb must be a load or store."); 941 942 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() || 943 MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) 944 return false; 945 946 // Retrieve the base, offset from the base and width. Width 947 // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If 948 // base are identical, and the offset of a lower memory access + 949 // the width doesn't overlap the offset of a higher memory access, 950 // then the memory accesses are different. 
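  // For example, an 8-byte store to [x0, #16] and a 4-byte load from
  // [x0, #24] cannot alias, since 16 + 8 <= 24.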
951 if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) && 952 getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, WidthB, TRI)) { 953 if (BaseOpA->isIdenticalTo(*BaseOpB)) { 954 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; 955 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; 956 int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; 957 if (LowOffset + LowWidth <= HighOffset) 958 return true; 959 } 960 } 961 return false; 962 } 963 964 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI, 965 const MachineBasicBlock *MBB, 966 const MachineFunction &MF) const { 967 if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF)) 968 return true; 969 switch (MI.getOpcode()) { 970 case AArch64::HINT: 971 // CSDB hints are scheduling barriers. 972 if (MI.getOperand(0).getImm() == 0x14) 973 return true; 974 break; 975 case AArch64::DSB: 976 case AArch64::ISB: 977 // DSB and ISB also are scheduling barriers. 978 return true; 979 default:; 980 } 981 return isSEHInstruction(MI); 982 } 983 984 /// analyzeCompare - For a comparison instruction, return the source registers 985 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. 986 /// Return true if the comparison instruction can be analyzed. 987 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, 988 unsigned &SrcReg2, int &CmpMask, 989 int &CmpValue) const { 990 // The first operand can be a frame index where we'd normally expect a 991 // register. 992 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands"); 993 if (!MI.getOperand(1).isReg()) 994 return false; 995 996 switch (MI.getOpcode()) { 997 default: 998 break; 999 case AArch64::SUBSWrr: 1000 case AArch64::SUBSWrs: 1001 case AArch64::SUBSWrx: 1002 case AArch64::SUBSXrr: 1003 case AArch64::SUBSXrs: 1004 case AArch64::SUBSXrx: 1005 case AArch64::ADDSWrr: 1006 case AArch64::ADDSWrs: 1007 case AArch64::ADDSWrx: 1008 case AArch64::ADDSXrr: 1009 case AArch64::ADDSXrs: 1010 case AArch64::ADDSXrx: 1011 // Replace SUBSWrr with SUBWrr if NZCV is not used. 1012 SrcReg = MI.getOperand(1).getReg(); 1013 SrcReg2 = MI.getOperand(2).getReg(); 1014 CmpMask = ~0; 1015 CmpValue = 0; 1016 return true; 1017 case AArch64::SUBSWri: 1018 case AArch64::ADDSWri: 1019 case AArch64::SUBSXri: 1020 case AArch64::ADDSXri: 1021 SrcReg = MI.getOperand(1).getReg(); 1022 SrcReg2 = 0; 1023 CmpMask = ~0; 1024 // FIXME: In order to convert CmpValue to 0 or 1 1025 CmpValue = MI.getOperand(2).getImm() != 0; 1026 return true; 1027 case AArch64::ANDSWri: 1028 case AArch64::ANDSXri: 1029 // ANDS does not use the same encoding scheme as the others xxxS 1030 // instructions. 1031 SrcReg = MI.getOperand(1).getReg(); 1032 SrcReg2 = 0; 1033 CmpMask = ~0; 1034 // FIXME:The return val type of decodeLogicalImmediate is uint64_t, 1035 // while the type of CmpValue is int. When converting uint64_t to int, 1036 // the high 32 bits of uint64_t will be lost. 1037 // In fact it causes a bug in spec2006-483.xalancbmk 1038 // CmpValue is only used to compare with zero in OptimizeCompareInstr 1039 CmpValue = AArch64_AM::decodeLogicalImmediate( 1040 MI.getOperand(2).getImm(), 1041 MI.getOpcode() == AArch64::ANDSWri ? 
32 : 64) != 0; 1042 return true; 1043 } 1044 1045 return false; 1046 } 1047 1048 static bool UpdateOperandRegClass(MachineInstr &Instr) { 1049 MachineBasicBlock *MBB = Instr.getParent(); 1050 assert(MBB && "Can't get MachineBasicBlock here"); 1051 MachineFunction *MF = MBB->getParent(); 1052 assert(MF && "Can't get MachineFunction here"); 1053 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); 1054 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); 1055 MachineRegisterInfo *MRI = &MF->getRegInfo(); 1056 1057 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx; 1058 ++OpIdx) { 1059 MachineOperand &MO = Instr.getOperand(OpIdx); 1060 const TargetRegisterClass *OpRegCstraints = 1061 Instr.getRegClassConstraint(OpIdx, TII, TRI); 1062 1063 // If there's no constraint, there's nothing to do. 1064 if (!OpRegCstraints) 1065 continue; 1066 // If the operand is a frame index, there's nothing to do here. 1067 // A frame index operand will resolve correctly during PEI. 1068 if (MO.isFI()) 1069 continue; 1070 1071 assert(MO.isReg() && 1072 "Operand has register constraints without being a register!"); 1073 1074 Register Reg = MO.getReg(); 1075 if (Register::isPhysicalRegister(Reg)) { 1076 if (!OpRegCstraints->contains(Reg)) 1077 return false; 1078 } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) && 1079 !MRI->constrainRegClass(Reg, OpRegCstraints)) 1080 return false; 1081 } 1082 1083 return true; 1084 } 1085 1086 /// Return the opcode that does not set flags when possible - otherwise 1087 /// return the original opcode. The caller is responsible to do the actual 1088 /// substitution and legality checking. 1089 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) { 1090 // Don't convert all compare instructions, because for some the zero register 1091 // encoding becomes the sp register. 1092 bool MIDefinesZeroReg = false; 1093 if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR)) 1094 MIDefinesZeroReg = true; 1095 1096 switch (MI.getOpcode()) { 1097 default: 1098 return MI.getOpcode(); 1099 case AArch64::ADDSWrr: 1100 return AArch64::ADDWrr; 1101 case AArch64::ADDSWri: 1102 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri; 1103 case AArch64::ADDSWrs: 1104 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs; 1105 case AArch64::ADDSWrx: 1106 return AArch64::ADDWrx; 1107 case AArch64::ADDSXrr: 1108 return AArch64::ADDXrr; 1109 case AArch64::ADDSXri: 1110 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri; 1111 case AArch64::ADDSXrs: 1112 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs; 1113 case AArch64::ADDSXrx: 1114 return AArch64::ADDXrx; 1115 case AArch64::SUBSWrr: 1116 return AArch64::SUBWrr; 1117 case AArch64::SUBSWri: 1118 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri; 1119 case AArch64::SUBSWrs: 1120 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs; 1121 case AArch64::SUBSWrx: 1122 return AArch64::SUBWrx; 1123 case AArch64::SUBSXrr: 1124 return AArch64::SUBXrr; 1125 case AArch64::SUBSXri: 1126 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri; 1127 case AArch64::SUBSXrs: 1128 return MIDefinesZeroReg ? 
                              AArch64::SUBSXrs : AArch64::SUBXrs;
  case AArch64::SUBSXrx:
    return AArch64::SUBXrx;
  }
}

enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };

/// True when condition flags are accessed (either by writing or reading)
/// on the instruction trace starting at From and ending at To.
///
/// Note: If From and To are from different blocks it is assumed the condition
/// flags are accessed on the path.
static bool areCFlagsAccessedBetweenInstrs(
    MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
    const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
  // Early exit if To is at the beginning of the BB.
  if (To == To->getParent()->begin())
    return true;

  // Check whether the instructions are in the same basic block.
  // If not, assume the condition flags might get modified somewhere.
  if (To->getParent() != From->getParent())
    return true;

  // From must be above To.
  assert(std::find_if(++To.getReverse(), To->getParent()->rend(),
                      [From](MachineInstr &MI) {
                        return MI.getIterator() == From;
                      }) != To->getParent()->rend());

  // We iterate backward starting at \p To until we hit \p From.
  for (--To; To != From; --To) {
    const MachineInstr &Instr = *To;

    if (((AccessToCheck & AK_Write) &&
         Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
        ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
      return true;
  }
  return false;
}

/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It is truly a compare instruction
/// only when there are no uses of its destination register.
///
/// The following steps are tried in order:
/// 1. Convert CmpInstr into a non-flag-setting version.
/// 2. Remove CmpInstr if above there is an instruction producing a needed
///    condition code or an instruction which can be converted into such an
///    instruction.
///    Only comparison with zero is supported.
bool AArch64InstrInfo::optimizeCompareInstr(
    MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
    int CmpValue, const MachineRegisterInfo *MRI) const {
  assert(CmpInstr.getParent());
  assert(MRI);

  // Replace SUBSWrr with SUBWrr if NZCV is not used.
  int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true);
  if (DeadNZCVIdx != -1) {
    if (CmpInstr.definesRegister(AArch64::WZR) ||
        CmpInstr.definesRegister(AArch64::XZR)) {
      CmpInstr.eraseFromParent();
      return true;
    }
    unsigned Opc = CmpInstr.getOpcode();
    unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
    if (NewOpc == Opc)
      return false;
    const MCInstrDesc &MCID = get(NewOpc);
    CmpInstr.setDesc(MCID);
    CmpInstr.RemoveOperand(DeadNZCVIdx);
    bool succeeded = UpdateOperandRegClass(CmpInstr);
    (void)succeeded;
    assert(succeeded && "Some operands reg class are incompatible!");
    return true;
  }

  // Continue only if we have a "ri" form where the immediate is zero.
  // FIXME: CmpValue has already been converted to 0 or 1 in the analyzeCompare
  // function.
  assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!");
  if (CmpValue != 0 || SrcReg2 != 0)
    return false;

  // CmpInstr is a Compare instruction if destination register is not used.
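  // For example, in
  //   %1 = SUBWri %0, 4, 0
  //   ...
  //   %2 = SUBSWri %1, 0, 0    (%2 unused, only EQ/NE consumed afterwards)
  // the SUBWri can be converted into SUBSWri and the explicit compare removed,
  // provided the checks in canInstrSubstituteCmpInstr below are satisfied.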
1217 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg())) 1218 return false; 1219 1220 return substituteCmpToZero(CmpInstr, SrcReg, MRI); 1221 } 1222 1223 /// Get opcode of S version of Instr. 1224 /// If Instr is S version its opcode is returned. 1225 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version 1226 /// or we are not interested in it. 1227 static unsigned sForm(MachineInstr &Instr) { 1228 switch (Instr.getOpcode()) { 1229 default: 1230 return AArch64::INSTRUCTION_LIST_END; 1231 1232 case AArch64::ADDSWrr: 1233 case AArch64::ADDSWri: 1234 case AArch64::ADDSXrr: 1235 case AArch64::ADDSXri: 1236 case AArch64::SUBSWrr: 1237 case AArch64::SUBSWri: 1238 case AArch64::SUBSXrr: 1239 case AArch64::SUBSXri: 1240 return Instr.getOpcode(); 1241 1242 case AArch64::ADDWrr: 1243 return AArch64::ADDSWrr; 1244 case AArch64::ADDWri: 1245 return AArch64::ADDSWri; 1246 case AArch64::ADDXrr: 1247 return AArch64::ADDSXrr; 1248 case AArch64::ADDXri: 1249 return AArch64::ADDSXri; 1250 case AArch64::ADCWr: 1251 return AArch64::ADCSWr; 1252 case AArch64::ADCXr: 1253 return AArch64::ADCSXr; 1254 case AArch64::SUBWrr: 1255 return AArch64::SUBSWrr; 1256 case AArch64::SUBWri: 1257 return AArch64::SUBSWri; 1258 case AArch64::SUBXrr: 1259 return AArch64::SUBSXrr; 1260 case AArch64::SUBXri: 1261 return AArch64::SUBSXri; 1262 case AArch64::SBCWr: 1263 return AArch64::SBCSWr; 1264 case AArch64::SBCXr: 1265 return AArch64::SBCSXr; 1266 case AArch64::ANDWri: 1267 return AArch64::ANDSWri; 1268 case AArch64::ANDXri: 1269 return AArch64::ANDSXri; 1270 } 1271 } 1272 1273 /// Check if AArch64::NZCV should be alive in successors of MBB. 1274 static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) { 1275 for (auto *BB : MBB->successors()) 1276 if (BB->isLiveIn(AArch64::NZCV)) 1277 return true; 1278 return false; 1279 } 1280 1281 namespace { 1282 1283 struct UsedNZCV { 1284 bool N = false; 1285 bool Z = false; 1286 bool C = false; 1287 bool V = false; 1288 1289 UsedNZCV() = default; 1290 1291 UsedNZCV &operator|=(const UsedNZCV &UsedFlags) { 1292 this->N |= UsedFlags.N; 1293 this->Z |= UsedFlags.Z; 1294 this->C |= UsedFlags.C; 1295 this->V |= UsedFlags.V; 1296 return *this; 1297 } 1298 }; 1299 1300 } // end anonymous namespace 1301 1302 /// Find a condition code used by the instruction. 1303 /// Returns AArch64CC::Invalid if either the instruction does not use condition 1304 /// codes or we don't optimize CmpInstr in the presence of such instructions. 
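/// For example, a conditional branch 'b.ge' yields AArch64CC::GE here, and a
/// 'csel w0, w1, w2, lt' yields AArch64CC::LT.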
1305 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) { 1306 switch (Instr.getOpcode()) { 1307 default: 1308 return AArch64CC::Invalid; 1309 1310 case AArch64::Bcc: { 1311 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); 1312 assert(Idx >= 2); 1313 return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm()); 1314 } 1315 1316 case AArch64::CSINVWr: 1317 case AArch64::CSINVXr: 1318 case AArch64::CSINCWr: 1319 case AArch64::CSINCXr: 1320 case AArch64::CSELWr: 1321 case AArch64::CSELXr: 1322 case AArch64::CSNEGWr: 1323 case AArch64::CSNEGXr: 1324 case AArch64::FCSELSrrr: 1325 case AArch64::FCSELDrrr: { 1326 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); 1327 assert(Idx >= 1); 1328 return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm()); 1329 } 1330 } 1331 } 1332 1333 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) { 1334 assert(CC != AArch64CC::Invalid); 1335 UsedNZCV UsedFlags; 1336 switch (CC) { 1337 default: 1338 break; 1339 1340 case AArch64CC::EQ: // Z set 1341 case AArch64CC::NE: // Z clear 1342 UsedFlags.Z = true; 1343 break; 1344 1345 case AArch64CC::HI: // Z clear and C set 1346 case AArch64CC::LS: // Z set or C clear 1347 UsedFlags.Z = true; 1348 LLVM_FALLTHROUGH; 1349 case AArch64CC::HS: // C set 1350 case AArch64CC::LO: // C clear 1351 UsedFlags.C = true; 1352 break; 1353 1354 case AArch64CC::MI: // N set 1355 case AArch64CC::PL: // N clear 1356 UsedFlags.N = true; 1357 break; 1358 1359 case AArch64CC::VS: // V set 1360 case AArch64CC::VC: // V clear 1361 UsedFlags.V = true; 1362 break; 1363 1364 case AArch64CC::GT: // Z clear, N and V the same 1365 case AArch64CC::LE: // Z set, N and V differ 1366 UsedFlags.Z = true; 1367 LLVM_FALLTHROUGH; 1368 case AArch64CC::GE: // N and V the same 1369 case AArch64CC::LT: // N and V differ 1370 UsedFlags.N = true; 1371 UsedFlags.V = true; 1372 break; 1373 } 1374 return UsedFlags; 1375 } 1376 1377 static bool isADDSRegImm(unsigned Opcode) { 1378 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri; 1379 } 1380 1381 static bool isSUBSRegImm(unsigned Opcode) { 1382 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri; 1383 } 1384 1385 /// Check if CmpInstr can be substituted by MI. 1386 /// 1387 /// CmpInstr can be substituted: 1388 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0' 1389 /// - and, MI and CmpInstr are from the same MachineBB 1390 /// - and, condition flags are not alive in successors of the CmpInstr parent 1391 /// - and, if MI opcode is the S form there must be no defs of flags between 1392 /// MI and CmpInstr 1393 /// or if MI opcode is not the S form there must be neither defs of flags 1394 /// nor uses of flags between MI and CmpInstr. 
1395 /// - and C/V flags are not used after CmpInstr 1396 static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr, 1397 const TargetRegisterInfo *TRI) { 1398 assert(MI); 1399 assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END); 1400 assert(CmpInstr); 1401 1402 const unsigned CmpOpcode = CmpInstr->getOpcode(); 1403 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode)) 1404 return false; 1405 1406 if (MI->getParent() != CmpInstr->getParent()) 1407 return false; 1408 1409 if (areCFlagsAliveInSuccessors(CmpInstr->getParent())) 1410 return false; 1411 1412 AccessKind AccessToCheck = AK_Write; 1413 if (sForm(*MI) != MI->getOpcode()) 1414 AccessToCheck = AK_All; 1415 if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck)) 1416 return false; 1417 1418 UsedNZCV NZCVUsedAfterCmp; 1419 for (auto I = std::next(CmpInstr->getIterator()), 1420 E = CmpInstr->getParent()->instr_end(); 1421 I != E; ++I) { 1422 const MachineInstr &Instr = *I; 1423 if (Instr.readsRegister(AArch64::NZCV, TRI)) { 1424 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr); 1425 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction 1426 return false; 1427 NZCVUsedAfterCmp |= getUsedNZCV(CC); 1428 } 1429 1430 if (Instr.modifiesRegister(AArch64::NZCV, TRI)) 1431 break; 1432 } 1433 1434 return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V; 1435 } 1436 1437 /// Substitute an instruction comparing to zero with another instruction 1438 /// which produces needed condition flags. 1439 /// 1440 /// Return true on success. 1441 bool AArch64InstrInfo::substituteCmpToZero( 1442 MachineInstr &CmpInstr, unsigned SrcReg, 1443 const MachineRegisterInfo *MRI) const { 1444 assert(MRI); 1445 // Get the unique definition of SrcReg. 1446 MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); 1447 if (!MI) 1448 return false; 1449 1450 const TargetRegisterInfo *TRI = &getRegisterInfo(); 1451 1452 unsigned NewOpc = sForm(*MI); 1453 if (NewOpc == AArch64::INSTRUCTION_LIST_END) 1454 return false; 1455 1456 if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI)) 1457 return false; 1458 1459 // Update the instruction to set NZCV. 1460 MI->setDesc(get(NewOpc)); 1461 CmpInstr.eraseFromParent(); 1462 bool succeeded = UpdateOperandRegClass(*MI); 1463 (void)succeeded; 1464 assert(succeeded && "Some operands reg class are incompatible!"); 1465 MI->addRegisterDefined(AArch64::NZCV, TRI); 1466 return true; 1467 } 1468 1469 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { 1470 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD && 1471 MI.getOpcode() != AArch64::CATCHRET) 1472 return false; 1473 1474 MachineBasicBlock &MBB = *MI.getParent(); 1475 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>(); 1476 auto TRI = Subtarget.getRegisterInfo(); 1477 DebugLoc DL = MI.getDebugLoc(); 1478 1479 if (MI.getOpcode() == AArch64::CATCHRET) { 1480 // Skip to the first instruction before the epilog. 
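    // The catchret target address is then materialized into x0 by the
    // adrp+add pair built below, placed just before the first FrameDestroy
    // instruction of the epilogue.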
1481 const TargetInstrInfo *TII = 1482 MBB.getParent()->getSubtarget().getInstrInfo(); 1483 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB(); 1484 auto MBBI = MachineBasicBlock::iterator(MI); 1485 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI); 1486 while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) && 1487 FirstEpilogSEH != MBB.begin()) 1488 FirstEpilogSEH = std::prev(FirstEpilogSEH); 1489 if (FirstEpilogSEH != MBB.begin()) 1490 FirstEpilogSEH = std::next(FirstEpilogSEH); 1491 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP)) 1492 .addReg(AArch64::X0, RegState::Define) 1493 .addMBB(TargetMBB); 1494 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri)) 1495 .addReg(AArch64::X0, RegState::Define) 1496 .addReg(AArch64::X0) 1497 .addMBB(TargetMBB) 1498 .addImm(0); 1499 return true; 1500 } 1501 1502 Register Reg = MI.getOperand(0).getReg(); 1503 const GlobalValue *GV = 1504 cast<GlobalValue>((*MI.memoperands_begin())->getValue()); 1505 const TargetMachine &TM = MBB.getParent()->getTarget(); 1506 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); 1507 const unsigned char MO_NC = AArch64II::MO_NC; 1508 1509 if ((OpFlags & AArch64II::MO_GOT) != 0) { 1510 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg) 1511 .addGlobalAddress(GV, 0, OpFlags); 1512 if (Subtarget.isTargetILP32()) { 1513 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 1514 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 1515 .addDef(Reg32, RegState::Dead) 1516 .addUse(Reg, RegState::Kill) 1517 .addImm(0) 1518 .addMemOperand(*MI.memoperands_begin()) 1519 .addDef(Reg, RegState::Implicit); 1520 } else { 1521 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 1522 .addReg(Reg, RegState::Kill) 1523 .addImm(0) 1524 .addMemOperand(*MI.memoperands_begin()); 1525 } 1526 } else if (TM.getCodeModel() == CodeModel::Large) { 1527 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?"); 1528 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) 1529 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC) 1530 .addImm(0); 1531 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 1532 .addReg(Reg, RegState::Kill) 1533 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC) 1534 .addImm(16); 1535 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 1536 .addReg(Reg, RegState::Kill) 1537 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC) 1538 .addImm(32); 1539 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 1540 .addReg(Reg, RegState::Kill) 1541 .addGlobalAddress(GV, 0, AArch64II::MO_G3) 1542 .addImm(48); 1543 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 1544 .addReg(Reg, RegState::Kill) 1545 .addImm(0) 1546 .addMemOperand(*MI.memoperands_begin()); 1547 } else if (TM.getCodeModel() == CodeModel::Tiny) { 1548 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg) 1549 .addGlobalAddress(GV, 0, OpFlags); 1550 } else { 1551 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg) 1552 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); 1553 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC; 1554 if (Subtarget.isTargetILP32()) { 1555 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 1556 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 1557 .addDef(Reg32, RegState::Dead) 1558 .addUse(Reg, RegState::Kill) 1559 .addGlobalAddress(GV, 0, LoFlags) 1560 .addMemOperand(*MI.memoperands_begin()) 1561 .addDef(Reg, RegState::Implicit); 1562 } else { 1563 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 1564 .addReg(Reg, RegState::Kill) 1565 .addGlobalAddress(GV, 0, LoFlags) 1566 .addMemOperand(*MI.memoperands_begin()); 1567 } 
  }

  MBB.erase(MI);

  return true;
}

// Return true if this instruction simply sets its single destination register
// to zero. This is equivalent to a register rename of the zero-register.
bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
    if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) {
      assert(MI.getDesc().getNumOperands() == 3 &&
             MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
      return true;
    }
    break;
  case AArch64::ANDWri: // and Rd, Rzr, #imm
    return MI.getOperand(1).getReg() == AArch64::WZR;
  case AArch64::ANDXri:
    return MI.getOperand(1).getReg() == AArch64::XZR;
  case TargetOpcode::COPY:
    return MI.getOperand(1).getReg() == AArch64::WZR;
  }
  return false;
}

// Return true if this instruction simply renames a general register without
// modifying bits.
bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    break;
  case TargetOpcode::COPY: {
    // GPR32 copies will be lowered to ORRXrs
    Register DstReg = MI.getOperand(0).getReg();
    return (AArch64::GPR32RegClass.contains(DstReg) ||
            AArch64::GPR64RegClass.contains(DstReg));
  }
  case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
    if (MI.getOperand(1).getReg() == AArch64::XZR) {
      assert(MI.getDesc().getNumOperands() == 4 &&
             MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
      return true;
    }
    break;
  case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
    if (MI.getOperand(2).getImm() == 0) {
      assert(MI.getDesc().getNumOperands() == 4 &&
             MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
      return true;
    }
    break;
  }
  return false;
}

// Return true if this instruction simply renames a floating-point register
// without modifying bits.
bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    break;
  case TargetOpcode::COPY: {
    // FPR64 copies will be lowered to ORR.16b
    Register DstReg = MI.getOperand(0).getReg();
    return (AArch64::FPR64RegClass.contains(DstReg) ||
            AArch64::FPR128RegClass.contains(DstReg));
  }
  case AArch64::ORRv16i8:
    if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
      assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
             "invalid ORRv16i8 operands");
      return true;
    }
    break;
  }
  return false;
}

unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                               int &FrameIndex) const {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::LDRWui:
  case AArch64::LDRXui:
  case AArch64::LDRBui:
  case AArch64::LDRHui:
  case AArch64::LDRSui:
  case AArch64::LDRDui:
  case AArch64::LDRQui:
    if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
        MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
      FrameIndex = MI.getOperand(1).getIndex();
      return MI.getOperand(0).getReg();
    }
    break;
  }

  return 0;
}

unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
                                              int &FrameIndex) const {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::STRWui:
  case AArch64::STRXui:
  case AArch64::STRBui:
  case AArch64::STRHui:
  case AArch64::STRSui:
  case AArch64::STRDui:
  case AArch64::STRQui:
    if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
        MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
      FrameIndex = MI.getOperand(1).getIndex();
      return MI.getOperand(0).getReg();
    }
    break;
  }
  return 0;
}

/// Check all MachineMemOperands for a hint to suppress pairing.
bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
  return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
    return MMO->getFlags() & MOSuppressPair;
  });
}

/// Set a flag on the first MachineMemOperand to suppress pairing.
void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
  if (MI.memoperands_empty())
    return;
  (*MI.memoperands_begin())->setFlags(MOSuppressPair);
}

/// Check all MachineMemOperands for a hint that the load/store is strided.
1712 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) { 1713 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 1714 return MMO->getFlags() & MOStridedAccess; 1715 }); 1716 } 1717 1718 bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) { 1719 switch (Opc) { 1720 default: 1721 return false; 1722 case AArch64::STURSi: 1723 case AArch64::STURDi: 1724 case AArch64::STURQi: 1725 case AArch64::STURBBi: 1726 case AArch64::STURHHi: 1727 case AArch64::STURWi: 1728 case AArch64::STURXi: 1729 case AArch64::LDURSi: 1730 case AArch64::LDURDi: 1731 case AArch64::LDURQi: 1732 case AArch64::LDURWi: 1733 case AArch64::LDURXi: 1734 case AArch64::LDURSWi: 1735 case AArch64::LDURHHi: 1736 case AArch64::LDURBBi: 1737 case AArch64::LDURSBWi: 1738 case AArch64::LDURSHWi: 1739 return true; 1740 } 1741 } 1742 1743 Optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) { 1744 switch (Opc) { 1745 default: return {}; 1746 case AArch64::PRFMui: return AArch64::PRFUMi; 1747 case AArch64::LDRXui: return AArch64::LDURXi; 1748 case AArch64::LDRWui: return AArch64::LDURWi; 1749 case AArch64::LDRBui: return AArch64::LDURBi; 1750 case AArch64::LDRHui: return AArch64::LDURHi; 1751 case AArch64::LDRSui: return AArch64::LDURSi; 1752 case AArch64::LDRDui: return AArch64::LDURDi; 1753 case AArch64::LDRQui: return AArch64::LDURQi; 1754 case AArch64::LDRBBui: return AArch64::LDURBBi; 1755 case AArch64::LDRHHui: return AArch64::LDURHHi; 1756 case AArch64::LDRSBXui: return AArch64::LDURSBXi; 1757 case AArch64::LDRSBWui: return AArch64::LDURSBWi; 1758 case AArch64::LDRSHXui: return AArch64::LDURSHXi; 1759 case AArch64::LDRSHWui: return AArch64::LDURSHWi; 1760 case AArch64::LDRSWui: return AArch64::LDURSWi; 1761 case AArch64::STRXui: return AArch64::STURXi; 1762 case AArch64::STRWui: return AArch64::STURWi; 1763 case AArch64::STRBui: return AArch64::STURBi; 1764 case AArch64::STRHui: return AArch64::STURHi; 1765 case AArch64::STRSui: return AArch64::STURSi; 1766 case AArch64::STRDui: return AArch64::STURDi; 1767 case AArch64::STRQui: return AArch64::STURQi; 1768 case AArch64::STRBBui: return AArch64::STURBBi; 1769 case AArch64::STRHHui: return AArch64::STURHHi; 1770 } 1771 } 1772 1773 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { 1774 switch (Opc) { 1775 default: 1776 return 2; 1777 case AArch64::LDPXi: 1778 case AArch64::LDPDi: 1779 case AArch64::STPXi: 1780 case AArch64::STPDi: 1781 case AArch64::LDNPXi: 1782 case AArch64::LDNPDi: 1783 case AArch64::STNPXi: 1784 case AArch64::STNPDi: 1785 case AArch64::LDPQi: 1786 case AArch64::STPQi: 1787 case AArch64::LDNPQi: 1788 case AArch64::STNPQi: 1789 case AArch64::LDPWi: 1790 case AArch64::LDPSi: 1791 case AArch64::STPWi: 1792 case AArch64::STPSi: 1793 case AArch64::LDNPWi: 1794 case AArch64::LDNPSi: 1795 case AArch64::STNPWi: 1796 case AArch64::STNPSi: 1797 case AArch64::LDG: 1798 case AArch64::STGPi: 1799 return 3; 1800 case AArch64::ADDG: 1801 case AArch64::STGOffset: 1802 return 2; 1803 } 1804 } 1805 1806 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) { 1807 switch (MI.getOpcode()) { 1808 default: 1809 return false; 1810 // Scaled instructions. 1811 case AArch64::STRSui: 1812 case AArch64::STRDui: 1813 case AArch64::STRQui: 1814 case AArch64::STRXui: 1815 case AArch64::STRWui: 1816 case AArch64::LDRSui: 1817 case AArch64::LDRDui: 1818 case AArch64::LDRQui: 1819 case AArch64::LDRXui: 1820 case AArch64::LDRWui: 1821 case AArch64::LDRSWui: 1822 // Unscaled instructions. 
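  // (These LDUR/STUR forms take a signed 9-bit byte offset, whereas the scaled
  // forms above take an unsigned 12-bit offset in units of the access size;
  // see getUnscaledLdSt above for the scaled-to-unscaled mapping.)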
1823 case AArch64::STURSi: 1824 case AArch64::STURDi: 1825 case AArch64::STURQi: 1826 case AArch64::STURWi: 1827 case AArch64::STURXi: 1828 case AArch64::LDURSi: 1829 case AArch64::LDURDi: 1830 case AArch64::LDURQi: 1831 case AArch64::LDURWi: 1832 case AArch64::LDURXi: 1833 case AArch64::LDURSWi: 1834 return true; 1835 } 1836 } 1837 1838 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc, 1839 bool &Is64Bit) { 1840 switch (Opc) { 1841 default: 1842 llvm_unreachable("Opcode has no flag setting equivalent!"); 1843 // 32-bit cases: 1844 case AArch64::ADDWri: 1845 Is64Bit = false; 1846 return AArch64::ADDSWri; 1847 case AArch64::ADDWrr: 1848 Is64Bit = false; 1849 return AArch64::ADDSWrr; 1850 case AArch64::ADDWrs: 1851 Is64Bit = false; 1852 return AArch64::ADDSWrs; 1853 case AArch64::ADDWrx: 1854 Is64Bit = false; 1855 return AArch64::ADDSWrx; 1856 case AArch64::ANDWri: 1857 Is64Bit = false; 1858 return AArch64::ANDSWri; 1859 case AArch64::ANDWrr: 1860 Is64Bit = false; 1861 return AArch64::ANDSWrr; 1862 case AArch64::ANDWrs: 1863 Is64Bit = false; 1864 return AArch64::ANDSWrs; 1865 case AArch64::BICWrr: 1866 Is64Bit = false; 1867 return AArch64::BICSWrr; 1868 case AArch64::BICWrs: 1869 Is64Bit = false; 1870 return AArch64::BICSWrs; 1871 case AArch64::SUBWri: 1872 Is64Bit = false; 1873 return AArch64::SUBSWri; 1874 case AArch64::SUBWrr: 1875 Is64Bit = false; 1876 return AArch64::SUBSWrr; 1877 case AArch64::SUBWrs: 1878 Is64Bit = false; 1879 return AArch64::SUBSWrs; 1880 case AArch64::SUBWrx: 1881 Is64Bit = false; 1882 return AArch64::SUBSWrx; 1883 // 64-bit cases: 1884 case AArch64::ADDXri: 1885 Is64Bit = true; 1886 return AArch64::ADDSXri; 1887 case AArch64::ADDXrr: 1888 Is64Bit = true; 1889 return AArch64::ADDSXrr; 1890 case AArch64::ADDXrs: 1891 Is64Bit = true; 1892 return AArch64::ADDSXrs; 1893 case AArch64::ADDXrx: 1894 Is64Bit = true; 1895 return AArch64::ADDSXrx; 1896 case AArch64::ANDXri: 1897 Is64Bit = true; 1898 return AArch64::ANDSXri; 1899 case AArch64::ANDXrr: 1900 Is64Bit = true; 1901 return AArch64::ANDSXrr; 1902 case AArch64::ANDXrs: 1903 Is64Bit = true; 1904 return AArch64::ANDSXrs; 1905 case AArch64::BICXrr: 1906 Is64Bit = true; 1907 return AArch64::BICSXrr; 1908 case AArch64::BICXrs: 1909 Is64Bit = true; 1910 return AArch64::BICSXrs; 1911 case AArch64::SUBXri: 1912 Is64Bit = true; 1913 return AArch64::SUBSXri; 1914 case AArch64::SUBXrr: 1915 Is64Bit = true; 1916 return AArch64::SUBSXrr; 1917 case AArch64::SUBXrs: 1918 Is64Bit = true; 1919 return AArch64::SUBSXrs; 1920 case AArch64::SUBXrx: 1921 Is64Bit = true; 1922 return AArch64::SUBSXrx; 1923 } 1924 } 1925 1926 // Is this a candidate for ld/st merging or pairing? For example, we don't 1927 // touch volatiles or load/stores that have a hint to avoid pair formation. 1928 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const { 1929 // If this is a volatile load/store, don't mess with it. 1930 if (MI.hasOrderedMemoryRef()) 1931 return false; 1932 1933 // Make sure this is a reg/fi+imm (as opposed to an address reloc). 1934 assert((MI.getOperand(1).isReg() || MI.getOperand(1).isFI()) && 1935 "Expected a reg or frame index operand."); 1936 if (!MI.getOperand(2).isImm()) 1937 return false; 1938 1939 // Can't merge/pair if the instruction modifies the base register. 1940 // e.g., ldr x0, [x0] 1941 // This case will never occur with an FI base. 
1942 if (MI.getOperand(1).isReg()) { 1943 Register BaseReg = MI.getOperand(1).getReg(); 1944 const TargetRegisterInfo *TRI = &getRegisterInfo(); 1945 if (MI.modifiesRegister(BaseReg, TRI)) 1946 return false; 1947 } 1948 1949 // Check if this load/store has a hint to avoid pair formation. 1950 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. 1951 if (isLdStPairSuppressed(MI)) 1952 return false; 1953 1954 // Do not pair any callee-save store/reload instructions in the 1955 // prologue/epilogue if the CFI information encoded the operations as separate 1956 // instructions, as that will cause the size of the actual prologue to mismatch 1957 // with the prologue size recorded in the Windows CFI. 1958 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo(); 1959 bool NeedsWinCFI = MAI->usesWindowsCFI() && 1960 MI.getMF()->getFunction().needsUnwindTableEntry(); 1961 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) || 1962 MI.getFlag(MachineInstr::FrameDestroy))) 1963 return false; 1964 1965 // On some CPUs quad load/store pairs are slower than two single load/stores. 1966 if (Subtarget.isPaired128Slow()) { 1967 switch (MI.getOpcode()) { 1968 default: 1969 break; 1970 case AArch64::LDURQi: 1971 case AArch64::STURQi: 1972 case AArch64::LDRQui: 1973 case AArch64::STRQui: 1974 return false; 1975 } 1976 } 1977 1978 return true; 1979 } 1980 1981 bool AArch64InstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt, 1982 const MachineOperand *&BaseOp, 1983 int64_t &Offset, 1984 const TargetRegisterInfo *TRI) const { 1985 if (!LdSt.mayLoadOrStore()) 1986 return false; 1987 1988 unsigned Width; 1989 return getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI); 1990 } 1991 1992 bool AArch64InstrInfo::getMemOperandWithOffsetWidth( 1993 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset, 1994 unsigned &Width, const TargetRegisterInfo *TRI) const { 1995 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 1996 // Handle only loads/stores with base register followed by immediate offset. 1997 if (LdSt.getNumExplicitOperands() == 3) { 1998 // Non-paired instruction (e.g., ldr x1, [x0, #8]). 1999 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) || 2000 !LdSt.getOperand(2).isImm()) 2001 return false; 2002 } else if (LdSt.getNumExplicitOperands() == 4) { 2003 // Paired instruction (e.g., ldp x1, x2, [x0, #8]). 2004 if (!LdSt.getOperand(1).isReg() || 2005 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) || 2006 !LdSt.getOperand(3).isImm()) 2007 return false; 2008 } else 2009 return false; 2010 2011 // Get the scaling factor for the instruction and set the width for the 2012 // instruction. 2013 unsigned Scale = 0; 2014 int64_t Dummy1, Dummy2; 2015 2016 // If this returns false, then it's an instruction we don't want to handle. 2017 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2)) 2018 return false; 2019 2020 // Compute the offset. Offset is calculated as the immediate operand 2021 // multiplied by the scaling factor. Unscaled instructions have scaling factor 2022 // set to 1. 
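// For example, LDRXui (Scale = 8) with immediate 2 addresses byte offset 16,
// while the unscaled LDURXi (Scale = 1) needs an immediate of 16 to address
// the same byte offset.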
2023 if (LdSt.getNumExplicitOperands() == 3) { 2024 BaseOp = &LdSt.getOperand(1); 2025 Offset = LdSt.getOperand(2).getImm() * Scale; 2026 } else { 2027 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); 2028 BaseOp = &LdSt.getOperand(2); 2029 Offset = LdSt.getOperand(3).getImm() * Scale; 2030 } 2031 2032 if (!BaseOp->isReg() && !BaseOp->isFI()) 2033 return false; 2034 2035 return true; 2036 } 2037 2038 MachineOperand & 2039 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const { 2040 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 2041 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1); 2042 assert(OfsOp.isImm() && "Offset operand wasn't immediate."); 2043 return OfsOp; 2044 } 2045 2046 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, 2047 unsigned &Width, int64_t &MinOffset, 2048 int64_t &MaxOffset) { 2049 switch (Opcode) { 2050 // Not a memory operation or something we want to handle. 2051 default: 2052 Scale = Width = 0; 2053 MinOffset = MaxOffset = 0; 2054 return false; 2055 case AArch64::STRWpost: 2056 case AArch64::LDRWpost: 2057 Width = 32; 2058 Scale = 4; 2059 MinOffset = -256; 2060 MaxOffset = 255; 2061 break; 2062 case AArch64::LDURQi: 2063 case AArch64::STURQi: 2064 Width = 16; 2065 Scale = 1; 2066 MinOffset = -256; 2067 MaxOffset = 255; 2068 break; 2069 case AArch64::PRFUMi: 2070 case AArch64::LDURXi: 2071 case AArch64::LDURDi: 2072 case AArch64::STURXi: 2073 case AArch64::STURDi: 2074 Width = 8; 2075 Scale = 1; 2076 MinOffset = -256; 2077 MaxOffset = 255; 2078 break; 2079 case AArch64::LDURWi: 2080 case AArch64::LDURSi: 2081 case AArch64::LDURSWi: 2082 case AArch64::STURWi: 2083 case AArch64::STURSi: 2084 Width = 4; 2085 Scale = 1; 2086 MinOffset = -256; 2087 MaxOffset = 255; 2088 break; 2089 case AArch64::LDURHi: 2090 case AArch64::LDURHHi: 2091 case AArch64::LDURSHXi: 2092 case AArch64::LDURSHWi: 2093 case AArch64::STURHi: 2094 case AArch64::STURHHi: 2095 Width = 2; 2096 Scale = 1; 2097 MinOffset = -256; 2098 MaxOffset = 255; 2099 break; 2100 case AArch64::LDURBi: 2101 case AArch64::LDURBBi: 2102 case AArch64::LDURSBXi: 2103 case AArch64::LDURSBWi: 2104 case AArch64::STURBi: 2105 case AArch64::STURBBi: 2106 Width = 1; 2107 Scale = 1; 2108 MinOffset = -256; 2109 MaxOffset = 255; 2110 break; 2111 case AArch64::LDPQi: 2112 case AArch64::LDNPQi: 2113 case AArch64::STPQi: 2114 case AArch64::STNPQi: 2115 Scale = 16; 2116 Width = 32; 2117 MinOffset = -64; 2118 MaxOffset = 63; 2119 break; 2120 case AArch64::LDRQui: 2121 case AArch64::STRQui: 2122 Scale = Width = 16; 2123 MinOffset = 0; 2124 MaxOffset = 4095; 2125 break; 2126 case AArch64::LDPXi: 2127 case AArch64::LDPDi: 2128 case AArch64::LDNPXi: 2129 case AArch64::LDNPDi: 2130 case AArch64::STPXi: 2131 case AArch64::STPDi: 2132 case AArch64::STNPXi: 2133 case AArch64::STNPDi: 2134 Scale = 8; 2135 Width = 16; 2136 MinOffset = -64; 2137 MaxOffset = 63; 2138 break; 2139 case AArch64::PRFMui: 2140 case AArch64::LDRXui: 2141 case AArch64::LDRDui: 2142 case AArch64::STRXui: 2143 case AArch64::STRDui: 2144 Scale = Width = 8; 2145 MinOffset = 0; 2146 MaxOffset = 4095; 2147 break; 2148 case AArch64::LDPWi: 2149 case AArch64::LDPSi: 2150 case AArch64::LDNPWi: 2151 case AArch64::LDNPSi: 2152 case AArch64::STPWi: 2153 case AArch64::STPSi: 2154 case AArch64::STNPWi: 2155 case AArch64::STNPSi: 2156 Scale = 4; 2157 Width = 8; 2158 MinOffset = -64; 2159 MaxOffset = 63; 2160 break; 2161 case AArch64::LDRWui: 2162 case AArch64::LDRSui: 
2163 case AArch64::LDRSWui: 2164 case AArch64::STRWui: 2165 case AArch64::STRSui: 2166 Scale = Width = 4; 2167 MinOffset = 0; 2168 MaxOffset = 4095; 2169 break; 2170 case AArch64::LDRHui: 2171 case AArch64::LDRHHui: 2172 case AArch64::LDRSHWui: 2173 case AArch64::LDRSHXui: 2174 case AArch64::STRHui: 2175 case AArch64::STRHHui: 2176 Scale = Width = 2; 2177 MinOffset = 0; 2178 MaxOffset = 4095; 2179 break; 2180 case AArch64::LDRBui: 2181 case AArch64::LDRBBui: 2182 case AArch64::LDRSBWui: 2183 case AArch64::LDRSBXui: 2184 case AArch64::STRBui: 2185 case AArch64::STRBBui: 2186 Scale = Width = 1; 2187 MinOffset = 0; 2188 MaxOffset = 4095; 2189 break; 2190 case AArch64::ADDG: 2191 Scale = 16; 2192 Width = 0; 2193 MinOffset = 0; 2194 MaxOffset = 63; 2195 break; 2196 case AArch64::TAGPstack: 2197 Scale = 16; 2198 Width = 0; 2199 // TAGP with a negative offset turns into SUBP, which has a maximum offset 2200 // of 63 (not 64!). 2201 MinOffset = -63; 2202 MaxOffset = 63; 2203 break; 2204 case AArch64::LDG: 2205 case AArch64::STGOffset: 2206 case AArch64::STZGOffset: 2207 Scale = Width = 16; 2208 MinOffset = -256; 2209 MaxOffset = 255; 2210 break; 2211 case AArch64::LDR_PXI: 2212 case AArch64::STR_PXI: 2213 Scale = Width = 2; 2214 MinOffset = -256; 2215 MaxOffset = 255; 2216 break; 2217 case AArch64::LDR_ZXI: 2218 case AArch64::STR_ZXI: 2219 Scale = Width = 16; 2220 MinOffset = -256; 2221 MaxOffset = 255; 2222 break; 2223 case AArch64::ST2GOffset: 2224 case AArch64::STZ2GOffset: 2225 Scale = 16; 2226 Width = 32; 2227 MinOffset = -256; 2228 MaxOffset = 255; 2229 break; 2230 case AArch64::STGPi: 2231 Scale = Width = 16; 2232 MinOffset = -64; 2233 MaxOffset = 63; 2234 break; 2235 } 2236 2237 return true; 2238 } 2239 2240 // Scaling factor for unscaled load or store. 2241 int AArch64InstrInfo::getMemScale(unsigned Opc) { 2242 switch (Opc) { 2243 default: 2244 llvm_unreachable("Opcode has unknown scale!"); 2245 case AArch64::LDRBBui: 2246 case AArch64::LDURBBi: 2247 case AArch64::LDRSBWui: 2248 case AArch64::LDURSBWi: 2249 case AArch64::STRBBui: 2250 case AArch64::STURBBi: 2251 return 1; 2252 case AArch64::LDRHHui: 2253 case AArch64::LDURHHi: 2254 case AArch64::LDRSHWui: 2255 case AArch64::LDURSHWi: 2256 case AArch64::STRHHui: 2257 case AArch64::STURHHi: 2258 return 2; 2259 case AArch64::LDRSui: 2260 case AArch64::LDURSi: 2261 case AArch64::LDRSWui: 2262 case AArch64::LDURSWi: 2263 case AArch64::LDRWui: 2264 case AArch64::LDURWi: 2265 case AArch64::STRSui: 2266 case AArch64::STURSi: 2267 case AArch64::STRWui: 2268 case AArch64::STURWi: 2269 case AArch64::LDPSi: 2270 case AArch64::LDPSWi: 2271 case AArch64::LDPWi: 2272 case AArch64::STPSi: 2273 case AArch64::STPWi: 2274 return 4; 2275 case AArch64::LDRDui: 2276 case AArch64::LDURDi: 2277 case AArch64::LDRXui: 2278 case AArch64::LDURXi: 2279 case AArch64::STRDui: 2280 case AArch64::STURDi: 2281 case AArch64::STRXui: 2282 case AArch64::STURXi: 2283 case AArch64::LDPDi: 2284 case AArch64::LDPXi: 2285 case AArch64::STPDi: 2286 case AArch64::STPXi: 2287 return 8; 2288 case AArch64::LDRQui: 2289 case AArch64::LDURQi: 2290 case AArch64::STRQui: 2291 case AArch64::STURQi: 2292 case AArch64::LDPQi: 2293 case AArch64::STPQi: 2294 case AArch64::STGOffset: 2295 case AArch64::STZGOffset: 2296 case AArch64::ST2GOffset: 2297 case AArch64::STZ2GOffset: 2298 case AArch64::STGPi: 2299 return 16; 2300 } 2301 } 2302 2303 // Scale the unscaled offsets. Returns false if the unscaled offset can't be 2304 // scaled. 
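// For example, an LDURXi (memory scale 8) with a byte offset of 24 scales to
// an element offset of 3, whereas a byte offset of 20 is not a multiple of 8
// and cannot be scaled.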
2305 static bool scaleOffset(unsigned Opc, int64_t &Offset) { 2306 int Scale = AArch64InstrInfo::getMemScale(Opc); 2307 2308 // If the byte-offset isn't a multiple of the stride, we can't scale this 2309 // offset. 2310 if (Offset % Scale != 0) 2311 return false; 2312 2313 // Convert the byte-offset used by unscaled into an "element" offset used 2314 // by the scaled pair load/store instructions. 2315 Offset /= Scale; 2316 return true; 2317 } 2318 2319 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) { 2320 if (FirstOpc == SecondOpc) 2321 return true; 2322 // We can also pair sign-ext and zero-ext instructions. 2323 switch (FirstOpc) { 2324 default: 2325 return false; 2326 case AArch64::LDRWui: 2327 case AArch64::LDURWi: 2328 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi; 2329 case AArch64::LDRSWui: 2330 case AArch64::LDURSWi: 2331 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi; 2332 } 2333 // These instructions can't be paired based on their opcodes. 2334 return false; 2335 } 2336 2337 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, 2338 int64_t Offset1, unsigned Opcode1, int FI2, 2339 int64_t Offset2, unsigned Opcode2) { 2340 // Accesses through fixed stack object frame indices may access a different 2341 // fixed stack slot. Check that the object offsets + offsets match. 2342 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) { 2343 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1); 2344 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2); 2345 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered."); 2346 // Convert to scaled object offsets. 2347 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1); 2348 if (ObjectOffset1 % Scale1 != 0) 2349 return false; 2350 ObjectOffset1 /= Scale1; 2351 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2); 2352 if (ObjectOffset2 % Scale2 != 0) 2353 return false; 2354 ObjectOffset2 /= Scale2; 2355 ObjectOffset1 += Offset1; 2356 ObjectOffset2 += Offset2; 2357 return ObjectOffset1 + 1 == ObjectOffset2; 2358 } 2359 2360 return FI1 == FI2; 2361 } 2362 2363 /// Detect opportunities for ldp/stp formation. 2364 /// 2365 /// Only called for LdSt for which getMemOperandWithOffset returns true. 2366 bool AArch64InstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1, 2367 const MachineOperand &BaseOp2, 2368 unsigned NumLoads) const { 2369 const MachineInstr &FirstLdSt = *BaseOp1.getParent(); 2370 const MachineInstr &SecondLdSt = *BaseOp2.getParent(); 2371 if (BaseOp1.getType() != BaseOp2.getType()) 2372 return false; 2373 2374 assert((BaseOp1.isReg() || BaseOp1.isFI()) && 2375 "Only base registers and frame indices are supported."); 2376 2377 // Check for both base regs and base FI. 2378 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg()) 2379 return false; 2380 2381 // Only cluster up to a single pair. 2382 if (NumLoads > 1) 2383 return false; 2384 2385 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt)) 2386 return false; 2387 2388 // Can we pair these instructions based on their opcodes? 2389 unsigned FirstOpc = FirstLdSt.getOpcode(); 2390 unsigned SecondOpc = SecondLdSt.getOpcode(); 2391 if (!canPairLdStOpc(FirstOpc, SecondOpc)) 2392 return false; 2393 2394 // Can't merge volatiles or load/stores that have a hint to avoid pair 2395 // formation, for example. 
2396 if (!isCandidateToMergeOrPair(FirstLdSt) || 2397 !isCandidateToMergeOrPair(SecondLdSt)) 2398 return false; 2399 2400 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate. 2401 int64_t Offset1 = FirstLdSt.getOperand(2).getImm(); 2402 if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1)) 2403 return false; 2404 2405 int64_t Offset2 = SecondLdSt.getOperand(2).getImm(); 2406 if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2)) 2407 return false; 2408 2409 // Pairwise instructions have a 7-bit signed offset field. 2410 if (Offset1 > 63 || Offset1 < -64) 2411 return false; 2412 2413 // The caller should already have ordered First/SecondLdSt by offset. 2414 // Note: except for non-equal frame index bases 2415 if (BaseOp1.isFI()) { 2416 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) && 2417 "Caller should have ordered offsets."); 2418 2419 const MachineFrameInfo &MFI = 2420 FirstLdSt.getParent()->getParent()->getFrameInfo(); 2421 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc, 2422 BaseOp2.getIndex(), Offset2, SecondOpc); 2423 } 2424 2425 assert(Offset1 <= Offset2 && "Caller should have ordered offsets."); 2426 2427 return Offset1 + 1 == Offset2; 2428 } 2429 2430 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, 2431 unsigned Reg, unsigned SubIdx, 2432 unsigned State, 2433 const TargetRegisterInfo *TRI) { 2434 if (!SubIdx) 2435 return MIB.addReg(Reg, State); 2436 2437 if (Register::isPhysicalRegister(Reg)) 2438 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); 2439 return MIB.addReg(Reg, State, SubIdx); 2440 } 2441 2442 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, 2443 unsigned NumRegs) { 2444 // We really want the positive remainder mod 32 here, that happens to be 2445 // easily obtainable with a mask. 
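// For example, for a 3-register tuple copy from encodings {1,2,3} to {2,3,4}:
// (2 - 1) & 0x1f == 1 < 3, so a forward copy would clobber part of the source
// and the caller copies the sub-registers in reverse. Copying from {2,3,4}
// down to {1,2,3} gives (1 - 2) & 0x1f == 31 >= 3, so a forward copy is safe.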
2446 return ((DestReg - SrcReg) & 0x1f) < NumRegs; 2447 } 2448 2449 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB, 2450 MachineBasicBlock::iterator I, 2451 const DebugLoc &DL, MCRegister DestReg, 2452 MCRegister SrcReg, bool KillSrc, 2453 unsigned Opcode, 2454 ArrayRef<unsigned> Indices) const { 2455 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON"); 2456 const TargetRegisterInfo *TRI = &getRegisterInfo(); 2457 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 2458 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 2459 unsigned NumRegs = Indices.size(); 2460 2461 int SubReg = 0, End = NumRegs, Incr = 1; 2462 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) { 2463 SubReg = NumRegs - 1; 2464 End = -1; 2465 Incr = -1; 2466 } 2467 2468 for (; SubReg != End; SubReg += Incr) { 2469 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 2470 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 2471 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); 2472 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 2473 } 2474 } 2475 2476 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB, 2477 MachineBasicBlock::iterator I, 2478 DebugLoc DL, unsigned DestReg, 2479 unsigned SrcReg, bool KillSrc, 2480 unsigned Opcode, unsigned ZeroReg, 2481 llvm::ArrayRef<unsigned> Indices) const { 2482 const TargetRegisterInfo *TRI = &getRegisterInfo(); 2483 unsigned NumRegs = Indices.size(); 2484 2485 #ifndef NDEBUG 2486 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 2487 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 2488 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 && 2489 "GPR reg sequences should not be able to overlap"); 2490 #endif 2491 2492 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) { 2493 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 2494 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 2495 MIB.addReg(ZeroReg); 2496 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 2497 MIB.addImm(0); 2498 } 2499 } 2500 2501 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, 2502 MachineBasicBlock::iterator I, 2503 const DebugLoc &DL, MCRegister DestReg, 2504 MCRegister SrcReg, bool KillSrc) const { 2505 if (AArch64::GPR32spRegClass.contains(DestReg) && 2506 (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { 2507 const TargetRegisterInfo *TRI = &getRegisterInfo(); 2508 2509 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) { 2510 // If either operand is WSP, expand to ADD #0. 2511 if (Subtarget.hasZeroCycleRegMove()) { 2512 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. 2513 MCRegister DestRegX = TRI->getMatchingSuperReg( 2514 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 2515 MCRegister SrcRegX = TRI->getMatchingSuperReg( 2516 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 2517 // This instruction is reading and writing X registers. This may upset 2518 // the register scavenger and machine verifier, so we need to indicate 2519 // that we are reading an undefined value from SrcRegX, but a proper 2520 // value from SrcReg. 
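// For example, a W1 -> WSP copy on such a subtarget is emitted as
// ADD SP, X1, #0, with X1 marked undef and W1 added as an implicit use.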
2521 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX) 2522 .addReg(SrcRegX, RegState::Undef) 2523 .addImm(0) 2524 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) 2525 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 2526 } else { 2527 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg) 2528 .addReg(SrcReg, getKillRegState(KillSrc)) 2529 .addImm(0) 2530 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 2531 } 2532 } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) { 2533 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg) 2534 .addImm(0) 2535 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 2536 } else { 2537 if (Subtarget.hasZeroCycleRegMove()) { 2538 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. 2539 MCRegister DestRegX = TRI->getMatchingSuperReg( 2540 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 2541 MCRegister SrcRegX = TRI->getMatchingSuperReg( 2542 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 2543 // This instruction is reading and writing X registers. This may upset 2544 // the register scavenger and machine verifier, so we need to indicate 2545 // that we are reading an undefined value from SrcRegX, but a proper 2546 // value from SrcReg. 2547 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX) 2548 .addReg(AArch64::XZR) 2549 .addReg(SrcRegX, RegState::Undef) 2550 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 2551 } else { 2552 // Otherwise, expand to ORR WZR. 2553 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg) 2554 .addReg(AArch64::WZR) 2555 .addReg(SrcReg, getKillRegState(KillSrc)); 2556 } 2557 } 2558 return; 2559 } 2560 2561 // Copy a Predicate register by ORRing with itself. 2562 if (AArch64::PPRRegClass.contains(DestReg) && 2563 AArch64::PPRRegClass.contains(SrcReg)) { 2564 assert(Subtarget.hasSVE() && "Unexpected SVE register."); 2565 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg) 2566 .addReg(SrcReg) // Pg 2567 .addReg(SrcReg) 2568 .addReg(SrcReg, getKillRegState(KillSrc)); 2569 return; 2570 } 2571 2572 // Copy a Z register by ORRing with itself. 2573 if (AArch64::ZPRRegClass.contains(DestReg) && 2574 AArch64::ZPRRegClass.contains(SrcReg)) { 2575 assert(Subtarget.hasSVE() && "Unexpected SVE register."); 2576 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg) 2577 .addReg(SrcReg) 2578 .addReg(SrcReg, getKillRegState(KillSrc)); 2579 return; 2580 } 2581 2582 if (AArch64::GPR64spRegClass.contains(DestReg) && 2583 (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { 2584 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { 2585 // If either operand is SP, expand to ADD #0. 2586 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg) 2587 .addReg(SrcReg, getKillRegState(KillSrc)) 2588 .addImm(0) 2589 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 2590 } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) { 2591 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg) 2592 .addImm(0) 2593 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 2594 } else { 2595 // Otherwise, expand to ORR XZR. 2596 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg) 2597 .addReg(AArch64::XZR) 2598 .addReg(SrcReg, getKillRegState(KillSrc)); 2599 } 2600 return; 2601 } 2602 2603 // Copy a DDDD register quad by copying the individual sub-registers. 
2604 if (AArch64::DDDDRegClass.contains(DestReg) && 2605 AArch64::DDDDRegClass.contains(SrcReg)) { 2606 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 2607 AArch64::dsub2, AArch64::dsub3}; 2608 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 2609 Indices); 2610 return; 2611 } 2612 2613 // Copy a DDD register triple by copying the individual sub-registers. 2614 if (AArch64::DDDRegClass.contains(DestReg) && 2615 AArch64::DDDRegClass.contains(SrcReg)) { 2616 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 2617 AArch64::dsub2}; 2618 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 2619 Indices); 2620 return; 2621 } 2622 2623 // Copy a DD register pair by copying the individual sub-registers. 2624 if (AArch64::DDRegClass.contains(DestReg) && 2625 AArch64::DDRegClass.contains(SrcReg)) { 2626 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1}; 2627 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 2628 Indices); 2629 return; 2630 } 2631 2632 // Copy a QQQQ register quad by copying the individual sub-registers. 2633 if (AArch64::QQQQRegClass.contains(DestReg) && 2634 AArch64::QQQQRegClass.contains(SrcReg)) { 2635 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 2636 AArch64::qsub2, AArch64::qsub3}; 2637 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 2638 Indices); 2639 return; 2640 } 2641 2642 // Copy a QQQ register triple by copying the individual sub-registers. 2643 if (AArch64::QQQRegClass.contains(DestReg) && 2644 AArch64::QQQRegClass.contains(SrcReg)) { 2645 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 2646 AArch64::qsub2}; 2647 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 2648 Indices); 2649 return; 2650 } 2651 2652 // Copy a QQ register pair by copying the individual sub-registers. 
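// For example, a Q0_Q1 -> Q2_Q3 copy expands to
// ORR v2.16b, v0.16b, v0.16b and ORR v3.16b, v1.16b, v1.16b
// (ordered so that the source is not clobbered).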
2653 if (AArch64::QQRegClass.contains(DestReg) && 2654 AArch64::QQRegClass.contains(SrcReg)) { 2655 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1}; 2656 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 2657 Indices); 2658 return; 2659 } 2660 2661 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) && 2662 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) { 2663 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64}; 2664 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs, 2665 AArch64::XZR, Indices); 2666 return; 2667 } 2668 2669 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) && 2670 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) { 2671 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32}; 2672 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs, 2673 AArch64::WZR, Indices); 2674 return; 2675 } 2676 2677 if (AArch64::FPR128RegClass.contains(DestReg) && 2678 AArch64::FPR128RegClass.contains(SrcReg)) { 2679 if (Subtarget.hasNEON()) { 2680 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 2681 .addReg(SrcReg) 2682 .addReg(SrcReg, getKillRegState(KillSrc)); 2683 } else { 2684 BuildMI(MBB, I, DL, get(AArch64::STRQpre)) 2685 .addReg(AArch64::SP, RegState::Define) 2686 .addReg(SrcReg, getKillRegState(KillSrc)) 2687 .addReg(AArch64::SP) 2688 .addImm(-16); 2689 BuildMI(MBB, I, DL, get(AArch64::LDRQpre)) 2690 .addReg(AArch64::SP, RegState::Define) 2691 .addReg(DestReg, RegState::Define) 2692 .addReg(AArch64::SP) 2693 .addImm(16); 2694 } 2695 return; 2696 } 2697 2698 if (AArch64::FPR64RegClass.contains(DestReg) && 2699 AArch64::FPR64RegClass.contains(SrcReg)) { 2700 if (Subtarget.hasNEON()) { 2701 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub, 2702 &AArch64::FPR128RegClass); 2703 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub, 2704 &AArch64::FPR128RegClass); 2705 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 2706 .addReg(SrcReg) 2707 .addReg(SrcReg, getKillRegState(KillSrc)); 2708 } else { 2709 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg) 2710 .addReg(SrcReg, getKillRegState(KillSrc)); 2711 } 2712 return; 2713 } 2714 2715 if (AArch64::FPR32RegClass.contains(DestReg) && 2716 AArch64::FPR32RegClass.contains(SrcReg)) { 2717 if (Subtarget.hasNEON()) { 2718 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub, 2719 &AArch64::FPR128RegClass); 2720 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub, 2721 &AArch64::FPR128RegClass); 2722 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 2723 .addReg(SrcReg) 2724 .addReg(SrcReg, getKillRegState(KillSrc)); 2725 } else { 2726 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 2727 .addReg(SrcReg, getKillRegState(KillSrc)); 2728 } 2729 return; 2730 } 2731 2732 if (AArch64::FPR16RegClass.contains(DestReg) && 2733 AArch64::FPR16RegClass.contains(SrcReg)) { 2734 if (Subtarget.hasNEON()) { 2735 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub, 2736 &AArch64::FPR128RegClass); 2737 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub, 2738 &AArch64::FPR128RegClass); 2739 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 2740 .addReg(SrcReg) 2741 .addReg(SrcReg, getKillRegState(KillSrc)); 2742 } else { 2743 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub, 2744 &AArch64::FPR32RegClass); 2745 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub, 2746 &AArch64::FPR32RegClass); 2747 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 2748 .addReg(SrcReg, getKillRegState(KillSrc)); 2749 } 2750 
return; 2751 } 2752 2753 if (AArch64::FPR8RegClass.contains(DestReg) && 2754 AArch64::FPR8RegClass.contains(SrcReg)) { 2755 if (Subtarget.hasNEON()) { 2756 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub, 2757 &AArch64::FPR128RegClass); 2758 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, 2759 &AArch64::FPR128RegClass); 2760 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 2761 .addReg(SrcReg) 2762 .addReg(SrcReg, getKillRegState(KillSrc)); 2763 } else { 2764 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub, 2765 &AArch64::FPR32RegClass); 2766 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, 2767 &AArch64::FPR32RegClass); 2768 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 2769 .addReg(SrcReg, getKillRegState(KillSrc)); 2770 } 2771 return; 2772 } 2773 2774 // Copies between GPR64 and FPR64. 2775 if (AArch64::FPR64RegClass.contains(DestReg) && 2776 AArch64::GPR64RegClass.contains(SrcReg)) { 2777 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) 2778 .addReg(SrcReg, getKillRegState(KillSrc)); 2779 return; 2780 } 2781 if (AArch64::GPR64RegClass.contains(DestReg) && 2782 AArch64::FPR64RegClass.contains(SrcReg)) { 2783 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg) 2784 .addReg(SrcReg, getKillRegState(KillSrc)); 2785 return; 2786 } 2787 // Copies between GPR32 and FPR32. 2788 if (AArch64::FPR32RegClass.contains(DestReg) && 2789 AArch64::GPR32RegClass.contains(SrcReg)) { 2790 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) 2791 .addReg(SrcReg, getKillRegState(KillSrc)); 2792 return; 2793 } 2794 if (AArch64::GPR32RegClass.contains(DestReg) && 2795 AArch64::FPR32RegClass.contains(SrcReg)) { 2796 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg) 2797 .addReg(SrcReg, getKillRegState(KillSrc)); 2798 return; 2799 } 2800 2801 if (DestReg == AArch64::NZCV) { 2802 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy"); 2803 BuildMI(MBB, I, DL, get(AArch64::MSR)) 2804 .addImm(AArch64SysReg::NZCV) 2805 .addReg(SrcReg, getKillRegState(KillSrc)) 2806 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define); 2807 return; 2808 } 2809 2810 if (SrcReg == AArch64::NZCV) { 2811 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy"); 2812 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg) 2813 .addImm(AArch64SysReg::NZCV) 2814 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); 2815 return; 2816 } 2817 2818 llvm_unreachable("unimplemented reg-to-reg copy"); 2819 } 2820 2821 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, 2822 MachineBasicBlock &MBB, 2823 MachineBasicBlock::iterator InsertBefore, 2824 const MCInstrDesc &MCID, 2825 unsigned SrcReg, bool IsKill, 2826 unsigned SubIdx0, unsigned SubIdx1, int FI, 2827 MachineMemOperand *MMO) { 2828 unsigned SrcReg0 = SrcReg; 2829 unsigned SrcReg1 = SrcReg; 2830 if (Register::isPhysicalRegister(SrcReg)) { 2831 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0); 2832 SubIdx0 = 0; 2833 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1); 2834 SubIdx1 = 0; 2835 } 2836 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 2837 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0) 2838 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1) 2839 .addFrameIndex(FI) 2840 .addImm(0) 2841 .addMemOperand(MMO); 2842 } 2843 2844 void AArch64InstrInfo::storeRegToStackSlot( 2845 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, 2846 bool isKill, int FI, const TargetRegisterClass *RC, 2847 const TargetRegisterInfo *TRI) const { 2848 MachineFunction &MF = *MBB.getParent(); 2849 
MachineFrameInfo &MFI = MF.getFrameInfo(); 2850 unsigned Align = MFI.getObjectAlignment(FI); 2851 2852 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 2853 MachineMemOperand *MMO = MF.getMachineMemOperand( 2854 PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); 2855 unsigned Opc = 0; 2856 bool Offset = true; 2857 switch (TRI->getSpillSize(*RC)) { 2858 case 1: 2859 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 2860 Opc = AArch64::STRBui; 2861 break; 2862 case 2: 2863 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 2864 Opc = AArch64::STRHui; 2865 break; 2866 case 4: 2867 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 2868 Opc = AArch64::STRWui; 2869 if (Register::isVirtualRegister(SrcReg)) 2870 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass); 2871 else 2872 assert(SrcReg != AArch64::WSP); 2873 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 2874 Opc = AArch64::STRSui; 2875 break; 2876 case 8: 2877 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 2878 Opc = AArch64::STRXui; 2879 if (Register::isVirtualRegister(SrcReg)) 2880 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 2881 else 2882 assert(SrcReg != AArch64::SP); 2883 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 2884 Opc = AArch64::STRDui; 2885 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 2886 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 2887 get(AArch64::STPWi), SrcReg, isKill, 2888 AArch64::sube32, AArch64::subo32, FI, MMO); 2889 return; 2890 } 2891 break; 2892 case 16: 2893 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 2894 Opc = AArch64::STRQui; 2895 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 2896 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 2897 Opc = AArch64::ST1Twov1d; 2898 Offset = false; 2899 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 2900 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 2901 get(AArch64::STPXi), SrcReg, isKill, 2902 AArch64::sube64, AArch64::subo64, FI, MMO); 2903 return; 2904 } 2905 break; 2906 case 24: 2907 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 2908 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 2909 Opc = AArch64::ST1Threev1d; 2910 Offset = false; 2911 } 2912 break; 2913 case 32: 2914 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 2915 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 2916 Opc = AArch64::ST1Fourv1d; 2917 Offset = false; 2918 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 2919 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 2920 Opc = AArch64::ST1Twov2d; 2921 Offset = false; 2922 } 2923 break; 2924 case 48: 2925 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 2926 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 2927 Opc = AArch64::ST1Threev2d; 2928 Offset = false; 2929 } 2930 break; 2931 case 64: 2932 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 2933 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 2934 Opc = AArch64::ST1Fourv2d; 2935 Offset = false; 2936 } 2937 break; 2938 } 2939 unsigned StackID = TargetStackID::Default; 2940 if (AArch64::PPRRegClass.hasSubClassEq(RC)) { 2941 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 2942 Opc = AArch64::STR_PXI; 2943 StackID = TargetStackID::SVEVector; 2944 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 2945 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 2946 Opc = AArch64::STR_ZXI; 
2947 StackID = TargetStackID::SVEVector; 2948 } 2949 assert(Opc && "Unknown register class"); 2950 MFI.setStackID(FI, StackID); 2951 2952 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 2953 .addReg(SrcReg, getKillRegState(isKill)) 2954 .addFrameIndex(FI); 2955 2956 if (Offset) 2957 MI.addImm(0); 2958 MI.addMemOperand(MMO); 2959 } 2960 2961 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, 2962 MachineBasicBlock &MBB, 2963 MachineBasicBlock::iterator InsertBefore, 2964 const MCInstrDesc &MCID, 2965 unsigned DestReg, unsigned SubIdx0, 2966 unsigned SubIdx1, int FI, 2967 MachineMemOperand *MMO) { 2968 unsigned DestReg0 = DestReg; 2969 unsigned DestReg1 = DestReg; 2970 bool IsUndef = true; 2971 if (Register::isPhysicalRegister(DestReg)) { 2972 DestReg0 = TRI.getSubReg(DestReg, SubIdx0); 2973 SubIdx0 = 0; 2974 DestReg1 = TRI.getSubReg(DestReg, SubIdx1); 2975 SubIdx1 = 0; 2976 IsUndef = false; 2977 } 2978 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 2979 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0) 2980 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1) 2981 .addFrameIndex(FI) 2982 .addImm(0) 2983 .addMemOperand(MMO); 2984 } 2985 2986 void AArch64InstrInfo::loadRegFromStackSlot( 2987 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg, 2988 int FI, const TargetRegisterClass *RC, 2989 const TargetRegisterInfo *TRI) const { 2990 MachineFunction &MF = *MBB.getParent(); 2991 MachineFrameInfo &MFI = MF.getFrameInfo(); 2992 unsigned Align = MFI.getObjectAlignment(FI); 2993 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 2994 MachineMemOperand *MMO = MF.getMachineMemOperand( 2995 PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); 2996 2997 unsigned Opc = 0; 2998 bool Offset = true; 2999 switch (TRI->getSpillSize(*RC)) { 3000 case 1: 3001 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 3002 Opc = AArch64::LDRBui; 3003 break; 3004 case 2: 3005 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 3006 Opc = AArch64::LDRHui; 3007 break; 3008 case 4: 3009 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 3010 Opc = AArch64::LDRWui; 3011 if (Register::isVirtualRegister(DestReg)) 3012 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass); 3013 else 3014 assert(DestReg != AArch64::WSP); 3015 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 3016 Opc = AArch64::LDRSui; 3017 break; 3018 case 8: 3019 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 3020 Opc = AArch64::LDRXui; 3021 if (Register::isVirtualRegister(DestReg)) 3022 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass); 3023 else 3024 assert(DestReg != AArch64::SP); 3025 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 3026 Opc = AArch64::LDRDui; 3027 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 3028 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 3029 get(AArch64::LDPWi), DestReg, AArch64::sube32, 3030 AArch64::subo32, FI, MMO); 3031 return; 3032 } 3033 break; 3034 case 16: 3035 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 3036 Opc = AArch64::LDRQui; 3037 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 3038 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3039 Opc = AArch64::LD1Twov1d; 3040 Offset = false; 3041 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 3042 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 3043 get(AArch64::LDPXi), DestReg, AArch64::sube64, 3044 AArch64::subo64, FI, MMO); 
3045 return; 3046 } 3047 break; 3048 case 24: 3049 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 3050 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3051 Opc = AArch64::LD1Threev1d; 3052 Offset = false; 3053 } 3054 break; 3055 case 32: 3056 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 3057 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3058 Opc = AArch64::LD1Fourv1d; 3059 Offset = false; 3060 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 3061 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3062 Opc = AArch64::LD1Twov2d; 3063 Offset = false; 3064 } 3065 break; 3066 case 48: 3067 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 3068 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3069 Opc = AArch64::LD1Threev2d; 3070 Offset = false; 3071 } 3072 break; 3073 case 64: 3074 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 3075 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3076 Opc = AArch64::LD1Fourv2d; 3077 Offset = false; 3078 } 3079 break; 3080 } 3081 3082 unsigned StackID = TargetStackID::Default; 3083 if (AArch64::PPRRegClass.hasSubClassEq(RC)) { 3084 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 3085 Opc = AArch64::LDR_PXI; 3086 StackID = TargetStackID::SVEVector; 3087 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 3088 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 3089 Opc = AArch64::LDR_ZXI; 3090 StackID = TargetStackID::SVEVector; 3091 } 3092 assert(Opc && "Unknown register class"); 3093 MFI.setStackID(FI, StackID); 3094 3095 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 3096 .addReg(DestReg, getDefRegState(true)) 3097 .addFrameIndex(FI); 3098 if (Offset) 3099 MI.addImm(0); 3100 MI.addMemOperand(MMO); 3101 } 3102 3103 // Helper function to emit a frame offset adjustment from a given 3104 // pointer (SrcReg), stored into DestReg. This function is explicit 3105 // in that it requires the opcode. 3106 static void emitFrameOffsetAdj(MachineBasicBlock &MBB, 3107 MachineBasicBlock::iterator MBBI, 3108 const DebugLoc &DL, unsigned DestReg, 3109 unsigned SrcReg, int64_t Offset, unsigned Opc, 3110 const TargetInstrInfo *TII, 3111 MachineInstr::MIFlag Flag, bool NeedsWinCFI, 3112 bool *HasWinCFI) { 3113 int Sign = 1; 3114 unsigned MaxEncoding, ShiftSize; 3115 switch (Opc) { 3116 case AArch64::ADDXri: 3117 case AArch64::ADDSXri: 3118 case AArch64::SUBXri: 3119 case AArch64::SUBSXri: 3120 MaxEncoding = 0xfff; 3121 ShiftSize = 12; 3122 break; 3123 case AArch64::ADDVL_XXI: 3124 case AArch64::ADDPL_XXI: 3125 MaxEncoding = 31; 3126 ShiftSize = 0; 3127 if (Offset < 0) { 3128 MaxEncoding = 32; 3129 Sign = -1; 3130 Offset = -Offset; 3131 } 3132 break; 3133 default: 3134 llvm_unreachable("Unsupported opcode"); 3135 } 3136 3137 // FIXME: If the offset won't fit in 24-bits, compute the offset into a 3138 // scratch register. If DestReg is a virtual register, use it as the 3139 // scratch register; otherwise, create a new virtual register (to be 3140 // replaced by the scavenger at the end of PEI). That case can be optimized 3141 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch 3142 // register can be loaded with offset%8 and the add/sub can use an extending 3143 // instruction with LSL#3. 3144 // Currently the function handles any offsets but generates a poor sequence 3145 // of code. 
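// For example, with Opc == AArch64::ADDXri and Offset == 0x100010, the loop
// below emits ADD Xd, Xn, #0x100, LSL #12 (covering 0x100000) followed by
// ADD Xd, Xd, #0x10.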
3146 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); 3147 3148 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; 3149 do { 3150 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue); 3151 unsigned LocalShiftSize = 0; 3152 if (ThisVal > MaxEncoding) { 3153 ThisVal = ThisVal >> ShiftSize; 3154 LocalShiftSize = ShiftSize; 3155 } 3156 assert((ThisVal >> ShiftSize) <= MaxEncoding && 3157 "Encoding cannot handle value that big"); 3158 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) 3159 .addReg(SrcReg) 3160 .addImm(Sign * (int)ThisVal); 3161 if (ShiftSize) 3162 MBI = MBI.addImm( 3163 AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize)); 3164 MBI = MBI.setMIFlag(Flag); 3165 3166 if (NeedsWinCFI) { 3167 assert(Sign == 1 && "SEH directives should always have a positive sign"); 3168 int Imm = (int)(ThisVal << LocalShiftSize); 3169 if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) || 3170 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) { 3171 if (HasWinCFI) 3172 *HasWinCFI = true; 3173 if (Imm == 0) 3174 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag); 3175 else 3176 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)) 3177 .addImm(Imm) 3178 .setMIFlag(Flag); 3179 assert((Offset - Imm) == 0 && "Expected remaining offset to be zero to " 3180 "emit a single SEH directive"); 3181 } else if (DestReg == AArch64::SP) { 3182 if (HasWinCFI) 3183 *HasWinCFI = true; 3184 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc"); 3185 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) 3186 .addImm(Imm) 3187 .setMIFlag(Flag); 3188 } 3189 if (HasWinCFI) 3190 *HasWinCFI = true; 3191 } 3192 3193 SrcReg = DestReg; 3194 Offset -= ThisVal << LocalShiftSize; 3195 } while (Offset); 3196 } 3197 3198 void llvm::emitFrameOffset(MachineBasicBlock &MBB, 3199 MachineBasicBlock::iterator MBBI, const DebugLoc &DL, 3200 unsigned DestReg, unsigned SrcReg, 3201 StackOffset Offset, const TargetInstrInfo *TII, 3202 MachineInstr::MIFlag Flag, bool SetNZCV, 3203 bool NeedsWinCFI, bool *HasWinCFI) { 3204 int64_t Bytes, NumPredicateVectors, NumDataVectors; 3205 Offset.getForFrameOffset(Bytes, NumPredicateVectors, NumDataVectors); 3206 3207 // First emit non-scalable frame offsets, or a simple 'mov'. 3208 if (Bytes || (!Offset && SrcReg != DestReg)) { 3209 assert((DestReg != AArch64::SP || Bytes % 16 == 0) && 3210 "SP increment/decrement not 16-byte aligned"); 3211 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri; 3212 if (Bytes < 0) { 3213 Bytes = -Bytes; 3214 Opc = SetNZCV ? 
AArch64::SUBSXri : AArch64::SUBXri; 3215 } 3216 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag, 3217 NeedsWinCFI, HasWinCFI); 3218 SrcReg = DestReg; 3219 } 3220 3221 assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) && 3222 "SetNZCV not supported with SVE vectors"); 3223 assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) && 3224 "WinCFI not supported with SVE vectors"); 3225 3226 if (NumDataVectors) { 3227 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors, 3228 AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr); 3229 SrcReg = DestReg; 3230 } 3231 3232 if (NumPredicateVectors) { 3233 assert(DestReg != AArch64::SP && "Unaligned access to SP"); 3234 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors, 3235 AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr); 3236 } 3237 } 3238 3239 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( 3240 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, 3241 MachineBasicBlock::iterator InsertPt, int FrameIndex, 3242 LiveIntervals *LIS, VirtRegMap *VRM) const { 3243 // This is a bit of a hack. Consider this instruction: 3244 // 3245 // %0 = COPY %sp; GPR64all:%0 3246 // 3247 // We explicitly chose GPR64all for the virtual register so such a copy might 3248 // be eliminated by RegisterCoalescer. However, that may not be possible, and 3249 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all 3250 // register class, TargetInstrInfo::foldMemoryOperand() is going to try. 3251 // 3252 // To prevent that, we are going to constrain the %0 register class here. 3253 // 3254 // <rdar://problem/11522048> 3255 // 3256 if (MI.isFullCopy()) { 3257 Register DstReg = MI.getOperand(0).getReg(); 3258 Register SrcReg = MI.getOperand(1).getReg(); 3259 if (SrcReg == AArch64::SP && Register::isVirtualRegister(DstReg)) { 3260 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass); 3261 return nullptr; 3262 } 3263 if (DstReg == AArch64::SP && Register::isVirtualRegister(SrcReg)) { 3264 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 3265 return nullptr; 3266 } 3267 } 3268 3269 // Handle the case where a copy is being spilled or filled but the source 3270 // and destination register class don't match. For example: 3271 // 3272 // %0 = COPY %xzr; GPR64common:%0 3273 // 3274 // In this case we can still safely fold away the COPY and generate the 3275 // following spill code: 3276 // 3277 // STRXui %xzr, %stack.0 3278 // 3279 // This also eliminates spilled cross register class COPYs (e.g. between x and 3280 // d regs) of the same size. For example: 3281 // 3282 // %0 = COPY %1; GPR64:%0, FPR64:%1 3283 // 3284 // will be filled as 3285 // 3286 // LDRDui %0, fi<#0> 3287 // 3288 // instead of 3289 // 3290 // LDRXui %Temp, fi<#0> 3291 // %0 = FMOV %Temp 3292 // 3293 if (MI.isCopy() && Ops.size() == 1 && 3294 // Make sure we're only folding the explicit COPY defs/uses. 3295 (Ops[0] == 0 || Ops[0] == 1)) { 3296 bool IsSpill = Ops[0] == 0; 3297 bool IsFill = !IsSpill; 3298 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); 3299 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3300 MachineBasicBlock &MBB = *MI.getParent(); 3301 const MachineOperand &DstMO = MI.getOperand(0); 3302 const MachineOperand &SrcMO = MI.getOperand(1); 3303 Register DstReg = DstMO.getReg(); 3304 Register SrcReg = SrcMO.getReg(); 3305 // This is slightly expensive to compute for physical regs since 3306 // getMinimalPhysRegClass is slow. 
3307 auto getRegClass = [&](unsigned Reg) { 3308 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg) 3309 : TRI.getMinimalPhysRegClass(Reg); 3310 }; 3311 3312 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { 3313 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) == 3314 TRI.getRegSizeInBits(*getRegClass(SrcReg)) && 3315 "Mismatched register size in non subreg COPY"); 3316 if (IsSpill) 3317 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, 3318 getRegClass(SrcReg), &TRI); 3319 else 3320 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, 3321 getRegClass(DstReg), &TRI); 3322 return &*--InsertPt; 3323 } 3324 3325 // Handle cases like spilling def of: 3326 // 3327 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0 3328 // 3329 // where the physical register source can be widened and stored to the full 3330 // virtual reg destination stack slot, in this case producing: 3331 // 3332 // STRXui %xzr, %stack.0 3333 // 3334 if (IsSpill && DstMO.isUndef() && Register::isPhysicalRegister(SrcReg)) { 3335 assert(SrcMO.getSubReg() == 0 && 3336 "Unexpected subreg on physical register"); 3337 const TargetRegisterClass *SpillRC; 3338 unsigned SpillSubreg; 3339 switch (DstMO.getSubReg()) { 3340 default: 3341 SpillRC = nullptr; 3342 break; 3343 case AArch64::sub_32: 3344 case AArch64::ssub: 3345 if (AArch64::GPR32RegClass.contains(SrcReg)) { 3346 SpillRC = &AArch64::GPR64RegClass; 3347 SpillSubreg = AArch64::sub_32; 3348 } else if (AArch64::FPR32RegClass.contains(SrcReg)) { 3349 SpillRC = &AArch64::FPR64RegClass; 3350 SpillSubreg = AArch64::ssub; 3351 } else 3352 SpillRC = nullptr; 3353 break; 3354 case AArch64::dsub: 3355 if (AArch64::FPR64RegClass.contains(SrcReg)) { 3356 SpillRC = &AArch64::FPR128RegClass; 3357 SpillSubreg = AArch64::dsub; 3358 } else 3359 SpillRC = nullptr; 3360 break; 3361 } 3362 3363 if (SpillRC) 3364 if (unsigned WidenedSrcReg = 3365 TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) { 3366 storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(), 3367 FrameIndex, SpillRC, &TRI); 3368 return &*--InsertPt; 3369 } 3370 } 3371 3372 // Handle cases like filling use of: 3373 // 3374 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1 3375 // 3376 // where we can load the full virtual reg source stack slot, into the subreg 3377 // destination, in this case producing: 3378 // 3379 // LDRWui %0:sub_32<def,read-undef>, %stack.0 3380 // 3381 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) { 3382 const TargetRegisterClass *FillRC; 3383 switch (DstMO.getSubReg()) { 3384 default: 3385 FillRC = nullptr; 3386 break; 3387 case AArch64::sub_32: 3388 FillRC = &AArch64::GPR32RegClass; 3389 break; 3390 case AArch64::ssub: 3391 FillRC = &AArch64::FPR32RegClass; 3392 break; 3393 case AArch64::dsub: 3394 FillRC = &AArch64::FPR64RegClass; 3395 break; 3396 } 3397 3398 if (FillRC) { 3399 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) == 3400 TRI.getRegSizeInBits(*FillRC) && 3401 "Mismatched regclass size on folded subreg COPY"); 3402 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI); 3403 MachineInstr &LoadMI = *--InsertPt; 3404 MachineOperand &LoadDst = LoadMI.getOperand(0); 3405 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load"); 3406 LoadDst.setSubReg(DstMO.getSubReg()); 3407 LoadDst.setIsUndef(); 3408 return &LoadMI; 3409 } 3410 } 3411 } 3412 3413 // Cannot fold. 
3414 return nullptr; 3415 } 3416 3417 static bool isSVEScaledImmInstruction(unsigned Opcode) { 3418 switch (Opcode) { 3419 case AArch64::LDR_ZXI: 3420 case AArch64::STR_ZXI: 3421 case AArch64::LDR_PXI: 3422 case AArch64::STR_PXI: 3423 return true; 3424 default: 3425 return false; 3426 } 3427 } 3428 3429 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, 3430 StackOffset &SOffset, 3431 bool *OutUseUnscaledOp, 3432 unsigned *OutUnscaledOp, 3433 int64_t *EmittableOffset) { 3434 // Set output values in case of early exit. 3435 if (EmittableOffset) 3436 *EmittableOffset = 0; 3437 if (OutUseUnscaledOp) 3438 *OutUseUnscaledOp = false; 3439 if (OutUnscaledOp) 3440 *OutUnscaledOp = 0; 3441 3442 // Exit early for structured vector spills/fills as they can't take an 3443 // immediate offset. 3444 switch (MI.getOpcode()) { 3445 default: 3446 break; 3447 case AArch64::LD1Twov2d: 3448 case AArch64::LD1Threev2d: 3449 case AArch64::LD1Fourv2d: 3450 case AArch64::LD1Twov1d: 3451 case AArch64::LD1Threev1d: 3452 case AArch64::LD1Fourv1d: 3453 case AArch64::ST1Twov2d: 3454 case AArch64::ST1Threev2d: 3455 case AArch64::ST1Fourv2d: 3456 case AArch64::ST1Twov1d: 3457 case AArch64::ST1Threev1d: 3458 case AArch64::ST1Fourv1d: 3459 case AArch64::IRG: 3460 case AArch64::IRGstack: 3461 return AArch64FrameOffsetCannotUpdate; 3462 } 3463 3464 // Get the min/max offset and the scale. 3465 unsigned Scale, Width; 3466 int64_t MinOff, MaxOff; 3467 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), Scale, Width, MinOff, 3468 MaxOff)) 3469 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 3470 3471 // Construct the complete offset. 3472 bool IsMulVL = isSVEScaledImmInstruction(MI.getOpcode()); 3473 int64_t Offset = 3474 IsMulVL ? (SOffset.getScalableBytes()) : (SOffset.getBytes()); 3475 3476 const MachineOperand &ImmOpnd = 3477 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode())); 3478 Offset += ImmOpnd.getImm() * Scale; 3479 3480 // If the offset doesn't match the scale, we rewrite the instruction to 3481 // use the unscaled instruction instead. Likewise, if we have a negative 3482 // offset and there is an unscaled op to use. 3483 Optional<unsigned> UnscaledOp = 3484 AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode()); 3485 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0); 3486 if (useUnscaledOp && 3487 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, Scale, Width, MinOff, MaxOff)) 3488 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 3489 3490 int64_t Remainder = Offset % Scale; 3491 assert(!(Remainder && useUnscaledOp) && 3492 "Cannot have remainder when using unscaled op"); 3493 3494 assert(MinOff < MaxOff && "Unexpected Min/Max offsets"); 3495 int64_t NewOffset = Offset / Scale; 3496 if (MinOff <= NewOffset && NewOffset <= MaxOff) 3497 Offset = Remainder; 3498 else { 3499 NewOffset = NewOffset < 0 ? MinOff : MaxOff; 3500 Offset = Offset - NewOffset * Scale + Remainder; 3501 } 3502 3503 if (EmittableOffset) 3504 *EmittableOffset = NewOffset; 3505 if (OutUseUnscaledOp) 3506 *OutUseUnscaledOp = useUnscaledOp; 3507 if (OutUnscaledOp && UnscaledOp) 3508 *OutUnscaledOp = *UnscaledOp; 3509 3510 if (IsMulVL) 3511 SOffset = StackOffset(Offset, MVT::nxv1i8) + 3512 StackOffset(SOffset.getBytes(), MVT::i8); 3513 else 3514 SOffset = StackOffset(Offset, MVT::i8) + 3515 StackOffset(SOffset.getScalableBytes(), MVT::nxv1i8); 3516 return AArch64FrameOffsetCanUpdate | 3517 (SOffset ? 
0 : AArch64FrameOffsetIsLegal); 3518 } 3519 3520 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, 3521 unsigned FrameReg, StackOffset &Offset, 3522 const AArch64InstrInfo *TII) { 3523 unsigned Opcode = MI.getOpcode(); 3524 unsigned ImmIdx = FrameRegIdx + 1; 3525 3526 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) { 3527 Offset += StackOffset(MI.getOperand(ImmIdx).getImm(), MVT::i8); 3528 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), 3529 MI.getOperand(0).getReg(), FrameReg, Offset, TII, 3530 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri)); 3531 MI.eraseFromParent(); 3532 Offset = StackOffset(); 3533 return true; 3534 } 3535 3536 int64_t NewOffset; 3537 unsigned UnscaledOp; 3538 bool UseUnscaledOp; 3539 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, 3540 &UnscaledOp, &NewOffset); 3541 if (Status & AArch64FrameOffsetCanUpdate) { 3542 if (Status & AArch64FrameOffsetIsLegal) 3543 // Replace the FrameIndex with FrameReg. 3544 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); 3545 if (UseUnscaledOp) 3546 MI.setDesc(TII->get(UnscaledOp)); 3547 3548 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); 3549 return !Offset; 3550 } 3551 3552 return false; 3553 } 3554 3555 void AArch64InstrInfo::getNoop(MCInst &NopInst) const { 3556 NopInst.setOpcode(AArch64::HINT); 3557 NopInst.addOperand(MCOperand::createImm(0)); 3558 } 3559 3560 // AArch64 supports MachineCombiner. 3561 bool AArch64InstrInfo::useMachineCombiner() const { return true; } 3562 3563 // True when Opc sets flag 3564 static bool isCombineInstrSettingFlag(unsigned Opc) { 3565 switch (Opc) { 3566 case AArch64::ADDSWrr: 3567 case AArch64::ADDSWri: 3568 case AArch64::ADDSXrr: 3569 case AArch64::ADDSXri: 3570 case AArch64::SUBSWrr: 3571 case AArch64::SUBSXrr: 3572 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 3573 case AArch64::SUBSWri: 3574 case AArch64::SUBSXri: 3575 return true; 3576 default: 3577 break; 3578 } 3579 return false; 3580 } 3581 3582 // 32b Opcodes that can be combined with a MUL 3583 static bool isCombineInstrCandidate32(unsigned Opc) { 3584 switch (Opc) { 3585 case AArch64::ADDWrr: 3586 case AArch64::ADDWri: 3587 case AArch64::SUBWrr: 3588 case AArch64::ADDSWrr: 3589 case AArch64::ADDSWri: 3590 case AArch64::SUBSWrr: 3591 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 3592 case AArch64::SUBWri: 3593 case AArch64::SUBSWri: 3594 return true; 3595 default: 3596 break; 3597 } 3598 return false; 3599 } 3600 3601 // 64b Opcodes that can be combined with a MUL 3602 static bool isCombineInstrCandidate64(unsigned Opc) { 3603 switch (Opc) { 3604 case AArch64::ADDXrr: 3605 case AArch64::ADDXri: 3606 case AArch64::SUBXrr: 3607 case AArch64::ADDSXrr: 3608 case AArch64::ADDSXri: 3609 case AArch64::SUBSXrr: 3610 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 
  case AArch64::SUBXri:
  case AArch64::SUBSXri:
  case AArch64::ADDv8i8:
  case AArch64::ADDv16i8:
  case AArch64::ADDv4i16:
  case AArch64::ADDv8i16:
  case AArch64::ADDv2i32:
  case AArch64::ADDv4i32:
  case AArch64::SUBv8i8:
  case AArch64::SUBv16i8:
  case AArch64::SUBv4i16:
  case AArch64::SUBv8i16:
  case AArch64::SUBv2i32:
  case AArch64::SUBv4i32:
    return true;
  default:
    break;
  }
  return false;
}

// FP Opcodes that can be combined with an FMUL
static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
  switch (Inst.getOpcode()) {
  default:
    break;
  case AArch64::FADDHrr:
  case AArch64::FADDSrr:
  case AArch64::FADDDrr:
  case AArch64::FADDv4f16:
  case AArch64::FADDv8f16:
  case AArch64::FADDv2f32:
  case AArch64::FADDv2f64:
  case AArch64::FADDv4f32:
  case AArch64::FSUBHrr:
  case AArch64::FSUBSrr:
  case AArch64::FSUBDrr:
  case AArch64::FSUBv4f16:
  case AArch64::FSUBv8f16:
  case AArch64::FSUBv2f32:
  case AArch64::FSUBv2f64:
  case AArch64::FSUBv4f32:
    TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
    return (Options.UnsafeFPMath ||
            Options.AllowFPOpFusion == FPOpFusion::Fast);
  }
  return false;
}

// Opcodes that can be combined with a MUL
static bool isCombineInstrCandidate(unsigned Opc) {
  return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
}

//
// Utility routine that checks if \param MO is defined by an
// \param CombineOpc instruction in the basic block \param MBB
static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
                       unsigned CombineOpc, unsigned ZeroReg = 0,
                       bool CheckZeroReg = false) {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineInstr *MI = nullptr;

  if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
    MI = MRI.getUniqueVRegDef(MO.getReg());
  // And it needs to be in the trace (otherwise, it won't have a depth).
  if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
    return false;
  // Must only be used by the user we combine with.
  if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
    return false;

  if (CheckZeroReg) {
    assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
           MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
           MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
    // The third input reg must be zero.
    if (MI->getOperand(3).getReg() != ZeroReg)
      return false;
  }

  return true;
}

//
// Is \param MO defined by an integer multiply and can be combined?
static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
                              unsigned MulOpc, unsigned ZeroReg) {
  return canCombine(MBB, MO, MulOpc, ZeroReg, true);
}

//
// Is \param MO defined by a floating-point multiply and can be combined?
static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
                               unsigned MulOpc) {
  return canCombine(MBB, MO, MulOpc);
}

// TODO: There are many more machine instruction opcodes to match:
//       1. Other data types (integer, vectors)
//       2. Other math / logic operations (xor, or)
//       3.
Other forms of the same operation (intrinsics and other variants) 3713 bool AArch64InstrInfo::isAssociativeAndCommutative( 3714 const MachineInstr &Inst) const { 3715 switch (Inst.getOpcode()) { 3716 case AArch64::FADDDrr: 3717 case AArch64::FADDSrr: 3718 case AArch64::FADDv2f32: 3719 case AArch64::FADDv2f64: 3720 case AArch64::FADDv4f32: 3721 case AArch64::FMULDrr: 3722 case AArch64::FMULSrr: 3723 case AArch64::FMULX32: 3724 case AArch64::FMULX64: 3725 case AArch64::FMULXv2f32: 3726 case AArch64::FMULXv2f64: 3727 case AArch64::FMULXv4f32: 3728 case AArch64::FMULv2f32: 3729 case AArch64::FMULv2f64: 3730 case AArch64::FMULv4f32: 3731 return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; 3732 default: 3733 return false; 3734 } 3735 } 3736 3737 /// Find instructions that can be turned into madd. 3738 static bool getMaddPatterns(MachineInstr &Root, 3739 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 3740 unsigned Opc = Root.getOpcode(); 3741 MachineBasicBlock &MBB = *Root.getParent(); 3742 bool Found = false; 3743 3744 if (!isCombineInstrCandidate(Opc)) 3745 return false; 3746 if (isCombineInstrSettingFlag(Opc)) { 3747 int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true); 3748 // When NZCV is live bail out. 3749 if (Cmp_NZCV == -1) 3750 return false; 3751 unsigned NewOpc = convertToNonFlagSettingOpc(Root); 3752 // When opcode can't change bail out. 3753 // CHECKME: do we miss any cases for opcode conversion? 3754 if (NewOpc == Opc) 3755 return false; 3756 Opc = NewOpc; 3757 } 3758 3759 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg, 3760 MachineCombinerPattern Pattern) { 3761 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) { 3762 Patterns.push_back(Pattern); 3763 Found = true; 3764 } 3765 }; 3766 3767 auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) { 3768 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) { 3769 Patterns.push_back(Pattern); 3770 Found = true; 3771 } 3772 }; 3773 3774 typedef MachineCombinerPattern MCP; 3775 3776 switch (Opc) { 3777 default: 3778 break; 3779 case AArch64::ADDWrr: 3780 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 3781 "ADDWrr does not have register operands"); 3782 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1); 3783 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2); 3784 break; 3785 case AArch64::ADDXrr: 3786 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1); 3787 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2); 3788 break; 3789 case AArch64::SUBWrr: 3790 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1); 3791 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2); 3792 break; 3793 case AArch64::SUBXrr: 3794 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1); 3795 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2); 3796 break; 3797 case AArch64::ADDWri: 3798 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1); 3799 break; 3800 case AArch64::ADDXri: 3801 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1); 3802 break; 3803 case AArch64::SUBWri: 3804 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1); 3805 break; 3806 case AArch64::SUBXri: 3807 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1); 3808 break; 3809 case AArch64::ADDv8i8: 3810 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1); 3811 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2); 3812 break; 3813 case 
AArch64::ADDv16i8: 3814 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1); 3815 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2); 3816 break; 3817 case AArch64::ADDv4i16: 3818 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1); 3819 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2); 3820 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1); 3821 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2); 3822 break; 3823 case AArch64::ADDv8i16: 3824 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1); 3825 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2); 3826 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1); 3827 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2); 3828 break; 3829 case AArch64::ADDv2i32: 3830 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1); 3831 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2); 3832 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1); 3833 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2); 3834 break; 3835 case AArch64::ADDv4i32: 3836 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1); 3837 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2); 3838 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1); 3839 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2); 3840 break; 3841 case AArch64::SUBv8i8: 3842 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1); 3843 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2); 3844 break; 3845 case AArch64::SUBv16i8: 3846 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1); 3847 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2); 3848 break; 3849 case AArch64::SUBv4i16: 3850 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1); 3851 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2); 3852 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1); 3853 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2); 3854 break; 3855 case AArch64::SUBv8i16: 3856 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1); 3857 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2); 3858 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1); 3859 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2); 3860 break; 3861 case AArch64::SUBv2i32: 3862 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1); 3863 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2); 3864 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1); 3865 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2); 3866 break; 3867 case AArch64::SUBv4i32: 3868 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1); 3869 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2); 3870 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1); 3871 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2); 3872 break; 3873 } 3874 return Found; 3875 } 3876 /// Floating-Point Support 3877 3878 /// Find instructions that can be turned into madd. 
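/// A minimal illustrative example (schematic MIR, not taken from a real
/// function): a sequence such as
///   %3:fpr32 = FMULSrr %1, %2
///   %4:fpr32 = FADDSrr killed %3, %0
/// is recorded here as FMULADDS_OP1 and later rewritten into a single
/// FMADDSrrr by genAlternativeCodeSequence().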
3879 static bool getFMAPatterns(MachineInstr &Root, 3880 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 3881 3882 if (!isCombineInstrCandidateFP(Root)) 3883 return false; 3884 3885 MachineBasicBlock &MBB = *Root.getParent(); 3886 bool Found = false; 3887 3888 auto Match = [&](int Opcode, int Operand, 3889 MachineCombinerPattern Pattern) -> bool { 3890 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) { 3891 Patterns.push_back(Pattern); 3892 return true; 3893 } 3894 return false; 3895 }; 3896 3897 typedef MachineCombinerPattern MCP; 3898 3899 switch (Root.getOpcode()) { 3900 default: 3901 assert(false && "Unsupported FP instruction in combiner\n"); 3902 break; 3903 case AArch64::FADDHrr: 3904 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 3905 "FADDHrr does not have register operands"); 3906 3907 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1); 3908 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2); 3909 break; 3910 case AArch64::FADDSrr: 3911 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 3912 "FADDSrr does not have register operands"); 3913 3914 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) || 3915 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1); 3916 3917 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) || 3918 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2); 3919 break; 3920 case AArch64::FADDDrr: 3921 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) || 3922 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1); 3923 3924 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) || 3925 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2); 3926 break; 3927 case AArch64::FADDv4f16: 3928 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) || 3929 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1); 3930 3931 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) || 3932 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2); 3933 break; 3934 case AArch64::FADDv8f16: 3935 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) || 3936 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1); 3937 3938 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) || 3939 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2); 3940 break; 3941 case AArch64::FADDv2f32: 3942 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) || 3943 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1); 3944 3945 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) || 3946 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2); 3947 break; 3948 case AArch64::FADDv2f64: 3949 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) || 3950 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1); 3951 3952 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) || 3953 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2); 3954 break; 3955 case AArch64::FADDv4f32: 3956 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) || 3957 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1); 3958 3959 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) || 3960 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2); 3961 break; 3962 case AArch64::FSUBHrr: 3963 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1); 3964 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2); 3965 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1); 3966 break; 
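  // For FSUB roots, a multiply feeding operand 1 is combined by negating the
  // subtracted operand (FNMSUB for scalars, an explicit FNEG plus FMLA for
  // vectors), while a multiply feeding operand 2 maps directly onto an
  // FMSUB/FMLS form; see genAlternativeCodeSequence() for the rewrites.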
3967 case AArch64::FSUBSrr: 3968 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1); 3969 3970 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) || 3971 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2); 3972 3973 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1); 3974 break; 3975 case AArch64::FSUBDrr: 3976 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1); 3977 3978 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) || 3979 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2); 3980 3981 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1); 3982 break; 3983 case AArch64::FSUBv4f16: 3984 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) || 3985 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2); 3986 3987 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) || 3988 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1); 3989 break; 3990 case AArch64::FSUBv8f16: 3991 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) || 3992 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2); 3993 3994 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) || 3995 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1); 3996 break; 3997 case AArch64::FSUBv2f32: 3998 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) || 3999 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2); 4000 4001 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) || 4002 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1); 4003 break; 4004 case AArch64::FSUBv2f64: 4005 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) || 4006 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2); 4007 4008 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) || 4009 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1); 4010 break; 4011 case AArch64::FSUBv4f32: 4012 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) || 4013 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2); 4014 4015 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) || 4016 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1); 4017 break; 4018 } 4019 return Found; 4020 } 4021 4022 /// Return true when a code sequence can improve throughput. It 4023 /// should be called only for instructions in loops. 
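/// The patterns accepted here are multiply-accumulate style rewrites (for
/// example a MUL feeding an ADD becoming an MLA); they are considered
/// worthwhile for throughput even when they do not shorten the critical path.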
4024 /// \param Pattern - combiner pattern 4025 bool AArch64InstrInfo::isThroughputPattern( 4026 MachineCombinerPattern Pattern) const { 4027 switch (Pattern) { 4028 default: 4029 break; 4030 case MachineCombinerPattern::FMULADDH_OP1: 4031 case MachineCombinerPattern::FMULADDH_OP2: 4032 case MachineCombinerPattern::FMULSUBH_OP1: 4033 case MachineCombinerPattern::FMULSUBH_OP2: 4034 case MachineCombinerPattern::FMULADDS_OP1: 4035 case MachineCombinerPattern::FMULADDS_OP2: 4036 case MachineCombinerPattern::FMULSUBS_OP1: 4037 case MachineCombinerPattern::FMULSUBS_OP2: 4038 case MachineCombinerPattern::FMULADDD_OP1: 4039 case MachineCombinerPattern::FMULADDD_OP2: 4040 case MachineCombinerPattern::FMULSUBD_OP1: 4041 case MachineCombinerPattern::FMULSUBD_OP2: 4042 case MachineCombinerPattern::FNMULSUBH_OP1: 4043 case MachineCombinerPattern::FNMULSUBS_OP1: 4044 case MachineCombinerPattern::FNMULSUBD_OP1: 4045 case MachineCombinerPattern::FMLAv4i16_indexed_OP1: 4046 case MachineCombinerPattern::FMLAv4i16_indexed_OP2: 4047 case MachineCombinerPattern::FMLAv8i16_indexed_OP1: 4048 case MachineCombinerPattern::FMLAv8i16_indexed_OP2: 4049 case MachineCombinerPattern::FMLAv1i32_indexed_OP1: 4050 case MachineCombinerPattern::FMLAv1i32_indexed_OP2: 4051 case MachineCombinerPattern::FMLAv1i64_indexed_OP1: 4052 case MachineCombinerPattern::FMLAv1i64_indexed_OP2: 4053 case MachineCombinerPattern::FMLAv4f16_OP2: 4054 case MachineCombinerPattern::FMLAv4f16_OP1: 4055 case MachineCombinerPattern::FMLAv8f16_OP1: 4056 case MachineCombinerPattern::FMLAv8f16_OP2: 4057 case MachineCombinerPattern::FMLAv2f32_OP2: 4058 case MachineCombinerPattern::FMLAv2f32_OP1: 4059 case MachineCombinerPattern::FMLAv2f64_OP1: 4060 case MachineCombinerPattern::FMLAv2f64_OP2: 4061 case MachineCombinerPattern::FMLAv2i32_indexed_OP1: 4062 case MachineCombinerPattern::FMLAv2i32_indexed_OP2: 4063 case MachineCombinerPattern::FMLAv2i64_indexed_OP1: 4064 case MachineCombinerPattern::FMLAv2i64_indexed_OP2: 4065 case MachineCombinerPattern::FMLAv4f32_OP1: 4066 case MachineCombinerPattern::FMLAv4f32_OP2: 4067 case MachineCombinerPattern::FMLAv4i32_indexed_OP1: 4068 case MachineCombinerPattern::FMLAv4i32_indexed_OP2: 4069 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: 4070 case MachineCombinerPattern::FMLSv4i16_indexed_OP2: 4071 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: 4072 case MachineCombinerPattern::FMLSv8i16_indexed_OP2: 4073 case MachineCombinerPattern::FMLSv1i32_indexed_OP2: 4074 case MachineCombinerPattern::FMLSv1i64_indexed_OP2: 4075 case MachineCombinerPattern::FMLSv2i32_indexed_OP2: 4076 case MachineCombinerPattern::FMLSv2i64_indexed_OP2: 4077 case MachineCombinerPattern::FMLSv4f16_OP1: 4078 case MachineCombinerPattern::FMLSv4f16_OP2: 4079 case MachineCombinerPattern::FMLSv8f16_OP1: 4080 case MachineCombinerPattern::FMLSv8f16_OP2: 4081 case MachineCombinerPattern::FMLSv2f32_OP2: 4082 case MachineCombinerPattern::FMLSv2f64_OP2: 4083 case MachineCombinerPattern::FMLSv4i32_indexed_OP2: 4084 case MachineCombinerPattern::FMLSv4f32_OP2: 4085 case MachineCombinerPattern::MULADDv8i8_OP1: 4086 case MachineCombinerPattern::MULADDv8i8_OP2: 4087 case MachineCombinerPattern::MULADDv16i8_OP1: 4088 case MachineCombinerPattern::MULADDv16i8_OP2: 4089 case MachineCombinerPattern::MULADDv4i16_OP1: 4090 case MachineCombinerPattern::MULADDv4i16_OP2: 4091 case MachineCombinerPattern::MULADDv8i16_OP1: 4092 case MachineCombinerPattern::MULADDv8i16_OP2: 4093 case MachineCombinerPattern::MULADDv2i32_OP1: 4094 case 
MachineCombinerPattern::MULADDv2i32_OP2: 4095 case MachineCombinerPattern::MULADDv4i32_OP1: 4096 case MachineCombinerPattern::MULADDv4i32_OP2: 4097 case MachineCombinerPattern::MULSUBv8i8_OP1: 4098 case MachineCombinerPattern::MULSUBv8i8_OP2: 4099 case MachineCombinerPattern::MULSUBv16i8_OP1: 4100 case MachineCombinerPattern::MULSUBv16i8_OP2: 4101 case MachineCombinerPattern::MULSUBv4i16_OP1: 4102 case MachineCombinerPattern::MULSUBv4i16_OP2: 4103 case MachineCombinerPattern::MULSUBv8i16_OP1: 4104 case MachineCombinerPattern::MULSUBv8i16_OP2: 4105 case MachineCombinerPattern::MULSUBv2i32_OP1: 4106 case MachineCombinerPattern::MULSUBv2i32_OP2: 4107 case MachineCombinerPattern::MULSUBv4i32_OP1: 4108 case MachineCombinerPattern::MULSUBv4i32_OP2: 4109 case MachineCombinerPattern::MULADDv4i16_indexed_OP1: 4110 case MachineCombinerPattern::MULADDv4i16_indexed_OP2: 4111 case MachineCombinerPattern::MULADDv8i16_indexed_OP1: 4112 case MachineCombinerPattern::MULADDv8i16_indexed_OP2: 4113 case MachineCombinerPattern::MULADDv2i32_indexed_OP1: 4114 case MachineCombinerPattern::MULADDv2i32_indexed_OP2: 4115 case MachineCombinerPattern::MULADDv4i32_indexed_OP1: 4116 case MachineCombinerPattern::MULADDv4i32_indexed_OP2: 4117 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 4118 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 4119 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 4120 case MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 4121 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 4122 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 4123 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 4124 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 4125 return true; 4126 } // end switch (Pattern) 4127 return false; 4128 } 4129 /// Return true when there is potentially a faster code sequence for an 4130 /// instruction chain ending in \p Root. All potential patterns are listed in 4131 /// the \p Pattern vector. Pattern should be sorted in priority order since the 4132 /// pattern evaluator stops checking as soon as it finds a faster sequence. 4133 4134 bool AArch64InstrInfo::getMachineCombinerPatterns( 4135 MachineInstr &Root, 4136 SmallVectorImpl<MachineCombinerPattern> &Patterns) const { 4137 // Integer patterns 4138 if (getMaddPatterns(Root, Patterns)) 4139 return true; 4140 // Floating point patterns 4141 if (getFMAPatterns(Root, Patterns)) 4142 return true; 4143 4144 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns); 4145 } 4146 4147 enum class FMAInstKind { Default, Indexed, Accumulator }; 4148 /// genFusedMultiply - Generate fused multiply instructions. 4149 /// This function supports both integer and floating point instructions. 4150 /// A typical example: 4151 /// F|MUL I=A,B,0 4152 /// F|ADD R,I,C 4153 /// ==> F|MADD R,A,B,C 4154 /// \param MF Containing MachineFunction 4155 /// \param MRI Register information 4156 /// \param TII Target information 4157 /// \param Root is the F|ADD instruction 4158 /// \param [out] InsInstrs is a vector of machine instructions and will 4159 /// contain the generated madd instruction 4160 /// \param IdxMulOpd is index of operand in Root that is the result of 4161 /// the F|MUL. In the example above IdxMulOpd is 1. 
/// \param MaddOpc the opcode of the f|madd instruction
/// \param RC Register class of operands
/// \param kind the kind of FMA instruction (addressing mode) to be generated
/// \param ReplacedAddend is the result register from the instruction
/// replacing the non-combined operand, if any.
static MachineInstr *
genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
                 const TargetInstrInfo *TII, MachineInstr &Root,
                 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
                 unsigned MaddOpc, const TargetRegisterClass *RC,
                 FMAInstKind kind = FMAInstKind::Default,
                 const Register *ReplacedAddend = nullptr) {
  assert(IdxMulOpd == 1 || IdxMulOpd == 2);

  unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
  Register ResultReg = Root.getOperand(0).getReg();
  Register SrcReg0 = MUL->getOperand(1).getReg();
  bool Src0IsKill = MUL->getOperand(1).isKill();
  Register SrcReg1 = MUL->getOperand(2).getReg();
  bool Src1IsKill = MUL->getOperand(2).isKill();

  unsigned SrcReg2;
  bool Src2IsKill;
  if (ReplacedAddend) {
    // If we just generated a new addend, we must be its only use.
    SrcReg2 = *ReplacedAddend;
    Src2IsKill = true;
  } else {
    SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
    Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
  }

  if (Register::isVirtualRegister(ResultReg))
    MRI.constrainRegClass(ResultReg, RC);
  if (Register::isVirtualRegister(SrcReg0))
    MRI.constrainRegClass(SrcReg0, RC);
  if (Register::isVirtualRegister(SrcReg1))
    MRI.constrainRegClass(SrcReg1, RC);
  if (Register::isVirtualRegister(SrcReg2))
    MRI.constrainRegClass(SrcReg2, RC);

  MachineInstrBuilder MIB;
  if (kind == FMAInstKind::Default)
    MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill))
              .addReg(SrcReg2, getKillRegState(Src2IsKill));
  else if (kind == FMAInstKind::Indexed)
    MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg2, getKillRegState(Src2IsKill))
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill))
              .addImm(MUL->getOperand(3).getImm());
  else if (kind == FMAInstKind::Accumulator)
    MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
              .addReg(SrcReg2, getKillRegState(Src2IsKill))
              .addReg(SrcReg0, getKillRegState(Src0IsKill))
              .addReg(SrcReg1, getKillRegState(Src1IsKill));
  else
    assert(false && "Invalid FMA instruction kind \n");
  // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
  InsInstrs.push_back(MIB);
  return MUL;
}

/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
/// instructions.
///
/// \see genFusedMultiply
static MachineInstr *genFusedMultiplyAcc(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Accumulator);
}

/// genNeg - Helper to generate an intermediate negation of the second operand
/// of Root.
static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
                       const TargetInstrInfo *TII, MachineInstr &Root,
                       SmallVectorImpl<MachineInstr *> &InsInstrs,
                       DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                       unsigned MnegOpc, const TargetRegisterClass *RC) {
  Register NewVR = MRI.createVirtualRegister(RC);
  MachineInstrBuilder MIB =
      BuildMI(MF, Root.getDebugLoc(), TII->get(MnegOpc), NewVR)
          .add(Root.getOperand(2));
  InsInstrs.push_back(MIB);

  assert(InstrIdxForVirtReg.empty());
  InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));

  return NewVR;
}

/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
/// instructions with an additional negation of the accumulator.
static MachineInstr *genFusedMultiplyAccNeg(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
    unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
  assert(IdxMulOpd == 1);

  Register NewVR =
      genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Accumulator, &NewVR);
}

/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
/// instructions that use an indexed (by-element) multiply.
///
/// \see genFusedMultiply
static MachineInstr *genFusedMultiplyIdx(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Indexed);
}

/// genFusedMultiplyIdxNeg - Helper to generate fused multiply accumulate
/// instructions that use an indexed (by-element) multiply, with an additional
/// negation of the accumulator.
static MachineInstr *genFusedMultiplyIdxNeg(
    MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
    MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
    unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
  assert(IdxMulOpd == 1);

  Register NewVR =
      genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);

  return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
                          FMAInstKind::Indexed, &NewVR);
}

/// genMaddR - Generate madd instruction and combine mul and add using
/// an extra virtual register.
/// Example - an ADD intermediate needs to be stored in a register:
///   MUL I=A,B,0
///   ADD R,I,Imm
///   ==> ORR V, ZR, Imm
///   ==> MADD R,A,B,V
/// \param MF Containing MachineFunction
/// \param MRI Register information
/// \param TII Target information
/// \param Root is the ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the madd instruction
/// \param VR is a virtual register that holds the value of an ADD operand
/// (V in the example above).
4320 /// \param RC Register class of operands 4321 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, 4322 const TargetInstrInfo *TII, MachineInstr &Root, 4323 SmallVectorImpl<MachineInstr *> &InsInstrs, 4324 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, 4325 const TargetRegisterClass *RC) { 4326 assert(IdxMulOpd == 1 || IdxMulOpd == 2); 4327 4328 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); 4329 Register ResultReg = Root.getOperand(0).getReg(); 4330 Register SrcReg0 = MUL->getOperand(1).getReg(); 4331 bool Src0IsKill = MUL->getOperand(1).isKill(); 4332 Register SrcReg1 = MUL->getOperand(2).getReg(); 4333 bool Src1IsKill = MUL->getOperand(2).isKill(); 4334 4335 if (Register::isVirtualRegister(ResultReg)) 4336 MRI.constrainRegClass(ResultReg, RC); 4337 if (Register::isVirtualRegister(SrcReg0)) 4338 MRI.constrainRegClass(SrcReg0, RC); 4339 if (Register::isVirtualRegister(SrcReg1)) 4340 MRI.constrainRegClass(SrcReg1, RC); 4341 if (Register::isVirtualRegister(VR)) 4342 MRI.constrainRegClass(VR, RC); 4343 4344 MachineInstrBuilder MIB = 4345 BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) 4346 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 4347 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 4348 .addReg(VR); 4349 // Insert the MADD 4350 InsInstrs.push_back(MIB); 4351 return MUL; 4352 } 4353 4354 /// When getMachineCombinerPatterns() finds potential patterns, 4355 /// this function generates the instructions that could replace the 4356 /// original code sequence 4357 void AArch64InstrInfo::genAlternativeCodeSequence( 4358 MachineInstr &Root, MachineCombinerPattern Pattern, 4359 SmallVectorImpl<MachineInstr *> &InsInstrs, 4360 SmallVectorImpl<MachineInstr *> &DelInstrs, 4361 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const { 4362 MachineBasicBlock &MBB = *Root.getParent(); 4363 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 4364 MachineFunction &MF = *MBB.getParent(); 4365 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); 4366 4367 MachineInstr *MUL; 4368 const TargetRegisterClass *RC; 4369 unsigned Opc; 4370 switch (Pattern) { 4371 default: 4372 // Reassociate instructions. 
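    // Patterns not handled by the cases below (in particular the generic
    // reassociation patterns) are delegated to the target-independent
    // implementation.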
4373 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, 4374 DelInstrs, InstrIdxForVirtReg); 4375 return; 4376 case MachineCombinerPattern::MULADDW_OP1: 4377 case MachineCombinerPattern::MULADDX_OP1: 4378 // MUL I=A,B,0 4379 // ADD R,I,C 4380 // ==> MADD R,A,B,C 4381 // --- Create(MADD); 4382 if (Pattern == MachineCombinerPattern::MULADDW_OP1) { 4383 Opc = AArch64::MADDWrrr; 4384 RC = &AArch64::GPR32RegClass; 4385 } else { 4386 Opc = AArch64::MADDXrrr; 4387 RC = &AArch64::GPR64RegClass; 4388 } 4389 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4390 break; 4391 case MachineCombinerPattern::MULADDW_OP2: 4392 case MachineCombinerPattern::MULADDX_OP2: 4393 // MUL I=A,B,0 4394 // ADD R,C,I 4395 // ==> MADD R,A,B,C 4396 // --- Create(MADD); 4397 if (Pattern == MachineCombinerPattern::MULADDW_OP2) { 4398 Opc = AArch64::MADDWrrr; 4399 RC = &AArch64::GPR32RegClass; 4400 } else { 4401 Opc = AArch64::MADDXrrr; 4402 RC = &AArch64::GPR64RegClass; 4403 } 4404 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4405 break; 4406 case MachineCombinerPattern::MULADDWI_OP1: 4407 case MachineCombinerPattern::MULADDXI_OP1: { 4408 // MUL I=A,B,0 4409 // ADD R,I,Imm 4410 // ==> ORR V, ZR, Imm 4411 // ==> MADD R,A,B,V 4412 // --- Create(MADD); 4413 const TargetRegisterClass *OrrRC; 4414 unsigned BitSize, OrrOpc, ZeroReg; 4415 if (Pattern == MachineCombinerPattern::MULADDWI_OP1) { 4416 OrrOpc = AArch64::ORRWri; 4417 OrrRC = &AArch64::GPR32spRegClass; 4418 BitSize = 32; 4419 ZeroReg = AArch64::WZR; 4420 Opc = AArch64::MADDWrrr; 4421 RC = &AArch64::GPR32RegClass; 4422 } else { 4423 OrrOpc = AArch64::ORRXri; 4424 OrrRC = &AArch64::GPR64spRegClass; 4425 BitSize = 64; 4426 ZeroReg = AArch64::XZR; 4427 Opc = AArch64::MADDXrrr; 4428 RC = &AArch64::GPR64RegClass; 4429 } 4430 Register NewVR = MRI.createVirtualRegister(OrrRC); 4431 uint64_t Imm = Root.getOperand(2).getImm(); 4432 4433 if (Root.getOperand(3).isImm()) { 4434 unsigned Val = Root.getOperand(3).getImm(); 4435 Imm = Imm << Val; 4436 } 4437 uint64_t UImm = SignExtend64(Imm, BitSize); 4438 uint64_t Encoding; 4439 if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { 4440 MachineInstrBuilder MIB1 = 4441 BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR) 4442 .addReg(ZeroReg) 4443 .addImm(Encoding); 4444 InsInstrs.push_back(MIB1); 4445 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 4446 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 4447 } 4448 break; 4449 } 4450 case MachineCombinerPattern::MULSUBW_OP1: 4451 case MachineCombinerPattern::MULSUBX_OP1: { 4452 // MUL I=A,B,0 4453 // SUB R,I, C 4454 // ==> SUB V, 0, C 4455 // ==> MADD R,A,B,V // = -C + A*B 4456 // --- Create(MADD); 4457 const TargetRegisterClass *SubRC; 4458 unsigned SubOpc, ZeroReg; 4459 if (Pattern == MachineCombinerPattern::MULSUBW_OP1) { 4460 SubOpc = AArch64::SUBWrr; 4461 SubRC = &AArch64::GPR32spRegClass; 4462 ZeroReg = AArch64::WZR; 4463 Opc = AArch64::MADDWrrr; 4464 RC = &AArch64::GPR32RegClass; 4465 } else { 4466 SubOpc = AArch64::SUBXrr; 4467 SubRC = &AArch64::GPR64spRegClass; 4468 ZeroReg = AArch64::XZR; 4469 Opc = AArch64::MADDXrrr; 4470 RC = &AArch64::GPR64RegClass; 4471 } 4472 Register NewVR = MRI.createVirtualRegister(SubRC); 4473 // SUB NewVR, 0, C 4474 MachineInstrBuilder MIB1 = 4475 BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR) 4476 .addReg(ZeroReg) 4477 .add(Root.getOperand(2)); 4478 InsInstrs.push_back(MIB1); 4479 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 4480 MUL 
= genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 4481 break; 4482 } 4483 case MachineCombinerPattern::MULSUBW_OP2: 4484 case MachineCombinerPattern::MULSUBX_OP2: 4485 // MUL I=A,B,0 4486 // SUB R,C,I 4487 // ==> MSUB R,A,B,C (computes C - A*B) 4488 // --- Create(MSUB); 4489 if (Pattern == MachineCombinerPattern::MULSUBW_OP2) { 4490 Opc = AArch64::MSUBWrrr; 4491 RC = &AArch64::GPR32RegClass; 4492 } else { 4493 Opc = AArch64::MSUBXrrr; 4494 RC = &AArch64::GPR64RegClass; 4495 } 4496 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4497 break; 4498 case MachineCombinerPattern::MULSUBWI_OP1: 4499 case MachineCombinerPattern::MULSUBXI_OP1: { 4500 // MUL I=A,B,0 4501 // SUB R,I, Imm 4502 // ==> ORR V, ZR, -Imm 4503 // ==> MADD R,A,B,V // = -Imm + A*B 4504 // --- Create(MADD); 4505 const TargetRegisterClass *OrrRC; 4506 unsigned BitSize, OrrOpc, ZeroReg; 4507 if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) { 4508 OrrOpc = AArch64::ORRWri; 4509 OrrRC = &AArch64::GPR32spRegClass; 4510 BitSize = 32; 4511 ZeroReg = AArch64::WZR; 4512 Opc = AArch64::MADDWrrr; 4513 RC = &AArch64::GPR32RegClass; 4514 } else { 4515 OrrOpc = AArch64::ORRXri; 4516 OrrRC = &AArch64::GPR64spRegClass; 4517 BitSize = 64; 4518 ZeroReg = AArch64::XZR; 4519 Opc = AArch64::MADDXrrr; 4520 RC = &AArch64::GPR64RegClass; 4521 } 4522 Register NewVR = MRI.createVirtualRegister(OrrRC); 4523 uint64_t Imm = Root.getOperand(2).getImm(); 4524 if (Root.getOperand(3).isImm()) { 4525 unsigned Val = Root.getOperand(3).getImm(); 4526 Imm = Imm << Val; 4527 } 4528 uint64_t UImm = SignExtend64(-Imm, BitSize); 4529 uint64_t Encoding; 4530 if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { 4531 MachineInstrBuilder MIB1 = 4532 BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR) 4533 .addReg(ZeroReg) 4534 .addImm(Encoding); 4535 InsInstrs.push_back(MIB1); 4536 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 4537 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 4538 } 4539 break; 4540 } 4541 4542 case MachineCombinerPattern::MULADDv8i8_OP1: 4543 Opc = AArch64::MLAv8i8; 4544 RC = &AArch64::FPR64RegClass; 4545 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4546 break; 4547 case MachineCombinerPattern::MULADDv8i8_OP2: 4548 Opc = AArch64::MLAv8i8; 4549 RC = &AArch64::FPR64RegClass; 4550 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4551 break; 4552 case MachineCombinerPattern::MULADDv16i8_OP1: 4553 Opc = AArch64::MLAv16i8; 4554 RC = &AArch64::FPR128RegClass; 4555 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4556 break; 4557 case MachineCombinerPattern::MULADDv16i8_OP2: 4558 Opc = AArch64::MLAv16i8; 4559 RC = &AArch64::FPR128RegClass; 4560 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4561 break; 4562 case MachineCombinerPattern::MULADDv4i16_OP1: 4563 Opc = AArch64::MLAv4i16; 4564 RC = &AArch64::FPR64RegClass; 4565 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4566 break; 4567 case MachineCombinerPattern::MULADDv4i16_OP2: 4568 Opc = AArch64::MLAv4i16; 4569 RC = &AArch64::FPR64RegClass; 4570 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4571 break; 4572 case MachineCombinerPattern::MULADDv8i16_OP1: 4573 Opc = AArch64::MLAv8i16; 4574 RC = &AArch64::FPR128RegClass; 4575 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4576 break; 4577 case MachineCombinerPattern::MULADDv8i16_OP2: 4578 Opc = AArch64::MLAv8i16; 4579 RC = 
&AArch64::FPR128RegClass; 4580 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4581 break; 4582 case MachineCombinerPattern::MULADDv2i32_OP1: 4583 Opc = AArch64::MLAv2i32; 4584 RC = &AArch64::FPR64RegClass; 4585 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4586 break; 4587 case MachineCombinerPattern::MULADDv2i32_OP2: 4588 Opc = AArch64::MLAv2i32; 4589 RC = &AArch64::FPR64RegClass; 4590 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4591 break; 4592 case MachineCombinerPattern::MULADDv4i32_OP1: 4593 Opc = AArch64::MLAv4i32; 4594 RC = &AArch64::FPR128RegClass; 4595 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4596 break; 4597 case MachineCombinerPattern::MULADDv4i32_OP2: 4598 Opc = AArch64::MLAv4i32; 4599 RC = &AArch64::FPR128RegClass; 4600 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4601 break; 4602 4603 case MachineCombinerPattern::MULSUBv8i8_OP1: 4604 Opc = AArch64::MLAv8i8; 4605 RC = &AArch64::FPR64RegClass; 4606 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 4607 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8, 4608 RC); 4609 break; 4610 case MachineCombinerPattern::MULSUBv8i8_OP2: 4611 Opc = AArch64::MLSv8i8; 4612 RC = &AArch64::FPR64RegClass; 4613 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4614 break; 4615 case MachineCombinerPattern::MULSUBv16i8_OP1: 4616 Opc = AArch64::MLAv16i8; 4617 RC = &AArch64::FPR128RegClass; 4618 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 4619 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8, 4620 RC); 4621 break; 4622 case MachineCombinerPattern::MULSUBv16i8_OP2: 4623 Opc = AArch64::MLSv16i8; 4624 RC = &AArch64::FPR128RegClass; 4625 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4626 break; 4627 case MachineCombinerPattern::MULSUBv4i16_OP1: 4628 Opc = AArch64::MLAv4i16; 4629 RC = &AArch64::FPR64RegClass; 4630 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 4631 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, 4632 RC); 4633 break; 4634 case MachineCombinerPattern::MULSUBv4i16_OP2: 4635 Opc = AArch64::MLSv4i16; 4636 RC = &AArch64::FPR64RegClass; 4637 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4638 break; 4639 case MachineCombinerPattern::MULSUBv8i16_OP1: 4640 Opc = AArch64::MLAv8i16; 4641 RC = &AArch64::FPR128RegClass; 4642 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 4643 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, 4644 RC); 4645 break; 4646 case MachineCombinerPattern::MULSUBv8i16_OP2: 4647 Opc = AArch64::MLSv8i16; 4648 RC = &AArch64::FPR128RegClass; 4649 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4650 break; 4651 case MachineCombinerPattern::MULSUBv2i32_OP1: 4652 Opc = AArch64::MLAv2i32; 4653 RC = &AArch64::FPR64RegClass; 4654 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 4655 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, 4656 RC); 4657 break; 4658 case MachineCombinerPattern::MULSUBv2i32_OP2: 4659 Opc = AArch64::MLSv2i32; 4660 RC = &AArch64::FPR64RegClass; 4661 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4662 break; 4663 case MachineCombinerPattern::MULSUBv4i32_OP1: 4664 Opc = AArch64::MLAv4i32; 4665 RC = &AArch64::FPR128RegClass; 4666 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 4667 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, 4668 RC); 4669 break; 4670 case MachineCombinerPattern::MULSUBv4i32_OP2: 4671 Opc = 
AArch64::MLSv4i32; 4672 RC = &AArch64::FPR128RegClass; 4673 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4674 break; 4675 4676 case MachineCombinerPattern::MULADDv4i16_indexed_OP1: 4677 Opc = AArch64::MLAv4i16_indexed; 4678 RC = &AArch64::FPR64RegClass; 4679 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4680 break; 4681 case MachineCombinerPattern::MULADDv4i16_indexed_OP2: 4682 Opc = AArch64::MLAv4i16_indexed; 4683 RC = &AArch64::FPR64RegClass; 4684 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4685 break; 4686 case MachineCombinerPattern::MULADDv8i16_indexed_OP1: 4687 Opc = AArch64::MLAv8i16_indexed; 4688 RC = &AArch64::FPR128RegClass; 4689 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4690 break; 4691 case MachineCombinerPattern::MULADDv8i16_indexed_OP2: 4692 Opc = AArch64::MLAv8i16_indexed; 4693 RC = &AArch64::FPR128RegClass; 4694 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4695 break; 4696 case MachineCombinerPattern::MULADDv2i32_indexed_OP1: 4697 Opc = AArch64::MLAv2i32_indexed; 4698 RC = &AArch64::FPR64RegClass; 4699 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4700 break; 4701 case MachineCombinerPattern::MULADDv2i32_indexed_OP2: 4702 Opc = AArch64::MLAv2i32_indexed; 4703 RC = &AArch64::FPR64RegClass; 4704 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4705 break; 4706 case MachineCombinerPattern::MULADDv4i32_indexed_OP1: 4707 Opc = AArch64::MLAv4i32_indexed; 4708 RC = &AArch64::FPR128RegClass; 4709 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4710 break; 4711 case MachineCombinerPattern::MULADDv4i32_indexed_OP2: 4712 Opc = AArch64::MLAv4i32_indexed; 4713 RC = &AArch64::FPR128RegClass; 4714 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4715 break; 4716 4717 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 4718 Opc = AArch64::MLAv4i16_indexed; 4719 RC = &AArch64::FPR64RegClass; 4720 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 4721 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, 4722 RC); 4723 break; 4724 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 4725 Opc = AArch64::MLSv4i16_indexed; 4726 RC = &AArch64::FPR64RegClass; 4727 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4728 break; 4729 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 4730 Opc = AArch64::MLAv8i16_indexed; 4731 RC = &AArch64::FPR128RegClass; 4732 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 4733 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, 4734 RC); 4735 break; 4736 case MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 4737 Opc = AArch64::MLSv8i16_indexed; 4738 RC = &AArch64::FPR128RegClass; 4739 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4740 break; 4741 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 4742 Opc = AArch64::MLAv2i32_indexed; 4743 RC = &AArch64::FPR64RegClass; 4744 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 4745 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, 4746 RC); 4747 break; 4748 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 4749 Opc = AArch64::MLSv2i32_indexed; 4750 RC = &AArch64::FPR64RegClass; 4751 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4752 break; 4753 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 4754 Opc = AArch64::MLAv4i32_indexed; 4755 RC = &AArch64::FPR128RegClass; 4756 MUL = genFusedMultiplyIdxNeg(MF, 
MRI, TII, Root, InsInstrs, 4757 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, 4758 RC); 4759 break; 4760 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 4761 Opc = AArch64::MLSv4i32_indexed; 4762 RC = &AArch64::FPR128RegClass; 4763 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4764 break; 4765 4766 // Floating Point Support 4767 case MachineCombinerPattern::FMULADDH_OP1: 4768 Opc = AArch64::FMADDHrrr; 4769 RC = &AArch64::FPR16RegClass; 4770 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4771 break; 4772 case MachineCombinerPattern::FMULADDS_OP1: 4773 Opc = AArch64::FMADDSrrr; 4774 RC = &AArch64::FPR32RegClass; 4775 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4776 break; 4777 case MachineCombinerPattern::FMULADDD_OP1: 4778 Opc = AArch64::FMADDDrrr; 4779 RC = &AArch64::FPR64RegClass; 4780 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4781 break; 4782 4783 case MachineCombinerPattern::FMULADDH_OP2: 4784 Opc = AArch64::FMADDHrrr; 4785 RC = &AArch64::FPR16RegClass; 4786 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4787 break; 4788 case MachineCombinerPattern::FMULADDS_OP2: 4789 Opc = AArch64::FMADDSrrr; 4790 RC = &AArch64::FPR32RegClass; 4791 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4792 break; 4793 case MachineCombinerPattern::FMULADDD_OP2: 4794 Opc = AArch64::FMADDDrrr; 4795 RC = &AArch64::FPR64RegClass; 4796 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4797 break; 4798 4799 case MachineCombinerPattern::FMLAv1i32_indexed_OP1: 4800 Opc = AArch64::FMLAv1i32_indexed; 4801 RC = &AArch64::FPR32RegClass; 4802 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4803 FMAInstKind::Indexed); 4804 break; 4805 case MachineCombinerPattern::FMLAv1i32_indexed_OP2: 4806 Opc = AArch64::FMLAv1i32_indexed; 4807 RC = &AArch64::FPR32RegClass; 4808 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4809 FMAInstKind::Indexed); 4810 break; 4811 4812 case MachineCombinerPattern::FMLAv1i64_indexed_OP1: 4813 Opc = AArch64::FMLAv1i64_indexed; 4814 RC = &AArch64::FPR64RegClass; 4815 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4816 FMAInstKind::Indexed); 4817 break; 4818 case MachineCombinerPattern::FMLAv1i64_indexed_OP2: 4819 Opc = AArch64::FMLAv1i64_indexed; 4820 RC = &AArch64::FPR64RegClass; 4821 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4822 FMAInstKind::Indexed); 4823 break; 4824 4825 case MachineCombinerPattern::FMLAv4i16_indexed_OP1: 4826 RC = &AArch64::FPR64RegClass; 4827 Opc = AArch64::FMLAv4i16_indexed; 4828 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4829 FMAInstKind::Indexed); 4830 break; 4831 case MachineCombinerPattern::FMLAv4f16_OP1: 4832 RC = &AArch64::FPR64RegClass; 4833 Opc = AArch64::FMLAv4f16; 4834 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4835 FMAInstKind::Accumulator); 4836 break; 4837 case MachineCombinerPattern::FMLAv4i16_indexed_OP2: 4838 RC = &AArch64::FPR64RegClass; 4839 Opc = AArch64::FMLAv4i16_indexed; 4840 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4841 FMAInstKind::Indexed); 4842 break; 4843 case MachineCombinerPattern::FMLAv4f16_OP2: 4844 RC = &AArch64::FPR64RegClass; 4845 Opc = AArch64::FMLAv4f16; 4846 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4847 FMAInstKind::Accumulator); 4848 break; 4849 4850 case MachineCombinerPattern::FMLAv2i32_indexed_OP1: 4851 case 
MachineCombinerPattern::FMLAv2f32_OP1: 4852 RC = &AArch64::FPR64RegClass; 4853 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) { 4854 Opc = AArch64::FMLAv2i32_indexed; 4855 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4856 FMAInstKind::Indexed); 4857 } else { 4858 Opc = AArch64::FMLAv2f32; 4859 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4860 FMAInstKind::Accumulator); 4861 } 4862 break; 4863 case MachineCombinerPattern::FMLAv2i32_indexed_OP2: 4864 case MachineCombinerPattern::FMLAv2f32_OP2: 4865 RC = &AArch64::FPR64RegClass; 4866 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) { 4867 Opc = AArch64::FMLAv2i32_indexed; 4868 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4869 FMAInstKind::Indexed); 4870 } else { 4871 Opc = AArch64::FMLAv2f32; 4872 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4873 FMAInstKind::Accumulator); 4874 } 4875 break; 4876 4877 case MachineCombinerPattern::FMLAv8i16_indexed_OP1: 4878 RC = &AArch64::FPR128RegClass; 4879 Opc = AArch64::FMLAv8i16_indexed; 4880 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4881 FMAInstKind::Indexed); 4882 break; 4883 case MachineCombinerPattern::FMLAv8f16_OP1: 4884 RC = &AArch64::FPR128RegClass; 4885 Opc = AArch64::FMLAv8f16; 4886 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4887 FMAInstKind::Accumulator); 4888 break; 4889 case MachineCombinerPattern::FMLAv8i16_indexed_OP2: 4890 RC = &AArch64::FPR128RegClass; 4891 Opc = AArch64::FMLAv8i16_indexed; 4892 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4893 FMAInstKind::Indexed); 4894 break; 4895 case MachineCombinerPattern::FMLAv8f16_OP2: 4896 RC = &AArch64::FPR128RegClass; 4897 Opc = AArch64::FMLAv8f16; 4898 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4899 FMAInstKind::Accumulator); 4900 break; 4901 4902 case MachineCombinerPattern::FMLAv2i64_indexed_OP1: 4903 case MachineCombinerPattern::FMLAv2f64_OP1: 4904 RC = &AArch64::FPR128RegClass; 4905 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) { 4906 Opc = AArch64::FMLAv2i64_indexed; 4907 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4908 FMAInstKind::Indexed); 4909 } else { 4910 Opc = AArch64::FMLAv2f64; 4911 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4912 FMAInstKind::Accumulator); 4913 } 4914 break; 4915 case MachineCombinerPattern::FMLAv2i64_indexed_OP2: 4916 case MachineCombinerPattern::FMLAv2f64_OP2: 4917 RC = &AArch64::FPR128RegClass; 4918 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) { 4919 Opc = AArch64::FMLAv2i64_indexed; 4920 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4921 FMAInstKind::Indexed); 4922 } else { 4923 Opc = AArch64::FMLAv2f64; 4924 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4925 FMAInstKind::Accumulator); 4926 } 4927 break; 4928 4929 case MachineCombinerPattern::FMLAv4i32_indexed_OP1: 4930 case MachineCombinerPattern::FMLAv4f32_OP1: 4931 RC = &AArch64::FPR128RegClass; 4932 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) { 4933 Opc = AArch64::FMLAv4i32_indexed; 4934 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4935 FMAInstKind::Indexed); 4936 } else { 4937 Opc = AArch64::FMLAv4f32; 4938 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 4939 FMAInstKind::Accumulator); 4940 } 4941 break; 4942 4943 case MachineCombinerPattern::FMLAv4i32_indexed_OP2: 4944 case 
MachineCombinerPattern::FMLAv4f32_OP2: 4945 RC = &AArch64::FPR128RegClass; 4946 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) { 4947 Opc = AArch64::FMLAv4i32_indexed; 4948 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4949 FMAInstKind::Indexed); 4950 } else { 4951 Opc = AArch64::FMLAv4f32; 4952 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 4953 FMAInstKind::Accumulator); 4954 } 4955 break; 4956 4957 case MachineCombinerPattern::FMULSUBH_OP1: 4958 Opc = AArch64::FNMSUBHrrr; 4959 RC = &AArch64::FPR16RegClass; 4960 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4961 break; 4962 case MachineCombinerPattern::FMULSUBS_OP1: 4963 Opc = AArch64::FNMSUBSrrr; 4964 RC = &AArch64::FPR32RegClass; 4965 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4966 break; 4967 case MachineCombinerPattern::FMULSUBD_OP1: 4968 Opc = AArch64::FNMSUBDrrr; 4969 RC = &AArch64::FPR64RegClass; 4970 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4971 break; 4972 4973 case MachineCombinerPattern::FNMULSUBH_OP1: 4974 Opc = AArch64::FNMADDHrrr; 4975 RC = &AArch64::FPR16RegClass; 4976 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4977 break; 4978 case MachineCombinerPattern::FNMULSUBS_OP1: 4979 Opc = AArch64::FNMADDSrrr; 4980 RC = &AArch64::FPR32RegClass; 4981 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4982 break; 4983 case MachineCombinerPattern::FNMULSUBD_OP1: 4984 Opc = AArch64::FNMADDDrrr; 4985 RC = &AArch64::FPR64RegClass; 4986 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 4987 break; 4988 4989 case MachineCombinerPattern::FMULSUBH_OP2: 4990 Opc = AArch64::FMSUBHrrr; 4991 RC = &AArch64::FPR16RegClass; 4992 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4993 break; 4994 case MachineCombinerPattern::FMULSUBS_OP2: 4995 Opc = AArch64::FMSUBSrrr; 4996 RC = &AArch64::FPR32RegClass; 4997 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 4998 break; 4999 case MachineCombinerPattern::FMULSUBD_OP2: 5000 Opc = AArch64::FMSUBDrrr; 5001 RC = &AArch64::FPR64RegClass; 5002 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5003 break; 5004 5005 case MachineCombinerPattern::FMLSv1i32_indexed_OP2: 5006 Opc = AArch64::FMLSv1i32_indexed; 5007 RC = &AArch64::FPR32RegClass; 5008 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5009 FMAInstKind::Indexed); 5010 break; 5011 5012 case MachineCombinerPattern::FMLSv1i64_indexed_OP2: 5013 Opc = AArch64::FMLSv1i64_indexed; 5014 RC = &AArch64::FPR64RegClass; 5015 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5016 FMAInstKind::Indexed); 5017 break; 5018 5019 case MachineCombinerPattern::FMLSv4f16_OP1: 5020 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: { 5021 RC = &AArch64::FPR64RegClass; 5022 Register NewVR = MRI.createVirtualRegister(RC); 5023 MachineInstrBuilder MIB1 = 5024 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f16), NewVR) 5025 .add(Root.getOperand(2)); 5026 InsInstrs.push_back(MIB1); 5027 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5028 if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) { 5029 Opc = AArch64::FMLAv4f16; 5030 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5031 FMAInstKind::Accumulator, &NewVR); 5032 } else { 5033 Opc = AArch64::FMLAv4i16_indexed; 5034 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5035 FMAInstKind::Indexed, &NewVR); 5036 } 5037 
break; 5038 } 5039 case MachineCombinerPattern::FMLSv4f16_OP2: 5040 RC = &AArch64::FPR64RegClass; 5041 Opc = AArch64::FMLSv4f16; 5042 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5043 FMAInstKind::Accumulator); 5044 break; 5045 case MachineCombinerPattern::FMLSv4i16_indexed_OP2: 5046 RC = &AArch64::FPR64RegClass; 5047 Opc = AArch64::FMLSv4i16_indexed; 5048 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5049 FMAInstKind::Indexed); 5050 break; 5051 5052 case MachineCombinerPattern::FMLSv2f32_OP2: 5053 case MachineCombinerPattern::FMLSv2i32_indexed_OP2: 5054 RC = &AArch64::FPR64RegClass; 5055 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) { 5056 Opc = AArch64::FMLSv2i32_indexed; 5057 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5058 FMAInstKind::Indexed); 5059 } else { 5060 Opc = AArch64::FMLSv2f32; 5061 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5062 FMAInstKind::Accumulator); 5063 } 5064 break; 5065 5066 case MachineCombinerPattern::FMLSv8f16_OP1: 5067 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: { 5068 RC = &AArch64::FPR128RegClass; 5069 Register NewVR = MRI.createVirtualRegister(RC); 5070 MachineInstrBuilder MIB1 = 5071 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv8f16), NewVR) 5072 .add(Root.getOperand(2)); 5073 InsInstrs.push_back(MIB1); 5074 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5075 if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) { 5076 Opc = AArch64::FMLAv8f16; 5077 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5078 FMAInstKind::Accumulator, &NewVR); 5079 } else { 5080 Opc = AArch64::FMLAv8i16_indexed; 5081 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5082 FMAInstKind::Indexed, &NewVR); 5083 } 5084 break; 5085 } 5086 case MachineCombinerPattern::FMLSv8f16_OP2: 5087 RC = &AArch64::FPR128RegClass; 5088 Opc = AArch64::FMLSv8f16; 5089 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5090 FMAInstKind::Accumulator); 5091 break; 5092 case MachineCombinerPattern::FMLSv8i16_indexed_OP2: 5093 RC = &AArch64::FPR128RegClass; 5094 Opc = AArch64::FMLSv8i16_indexed; 5095 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5096 FMAInstKind::Indexed); 5097 break; 5098 5099 case MachineCombinerPattern::FMLSv2f64_OP2: 5100 case MachineCombinerPattern::FMLSv2i64_indexed_OP2: 5101 RC = &AArch64::FPR128RegClass; 5102 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) { 5103 Opc = AArch64::FMLSv2i64_indexed; 5104 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5105 FMAInstKind::Indexed); 5106 } else { 5107 Opc = AArch64::FMLSv2f64; 5108 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5109 FMAInstKind::Accumulator); 5110 } 5111 break; 5112 5113 case MachineCombinerPattern::FMLSv4f32_OP2: 5114 case MachineCombinerPattern::FMLSv4i32_indexed_OP2: 5115 RC = &AArch64::FPR128RegClass; 5116 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) { 5117 Opc = AArch64::FMLSv4i32_indexed; 5118 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5119 FMAInstKind::Indexed); 5120 } else { 5121 Opc = AArch64::FMLSv4f32; 5122 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5123 FMAInstKind::Accumulator); 5124 } 5125 break; 5126 case MachineCombinerPattern::FMLSv2f32_OP1: 5127 case MachineCombinerPattern::FMLSv2i32_indexed_OP1: { 5128 RC = &AArch64::FPR64RegClass; 5129 Register NewVR = MRI.createVirtualRegister(RC); 5130 
MachineInstrBuilder MIB1 = 5131 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR) 5132 .add(Root.getOperand(2)); 5133 InsInstrs.push_back(MIB1); 5134 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5135 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) { 5136 Opc = AArch64::FMLAv2i32_indexed; 5137 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5138 FMAInstKind::Indexed, &NewVR); 5139 } else { 5140 Opc = AArch64::FMLAv2f32; 5141 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5142 FMAInstKind::Accumulator, &NewVR); 5143 } 5144 break; 5145 } 5146 case MachineCombinerPattern::FMLSv4f32_OP1: 5147 case MachineCombinerPattern::FMLSv4i32_indexed_OP1: { 5148 RC = &AArch64::FPR128RegClass; 5149 Register NewVR = MRI.createVirtualRegister(RC); 5150 MachineInstrBuilder MIB1 = 5151 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR) 5152 .add(Root.getOperand(2)); 5153 InsInstrs.push_back(MIB1); 5154 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5155 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) { 5156 Opc = AArch64::FMLAv4i32_indexed; 5157 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5158 FMAInstKind::Indexed, &NewVR); 5159 } else { 5160 Opc = AArch64::FMLAv4f32; 5161 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5162 FMAInstKind::Accumulator, &NewVR); 5163 } 5164 break; 5165 } 5166 case MachineCombinerPattern::FMLSv2f64_OP1: 5167 case MachineCombinerPattern::FMLSv2i64_indexed_OP1: { 5168 RC = &AArch64::FPR128RegClass; 5169 Register NewVR = MRI.createVirtualRegister(RC); 5170 MachineInstrBuilder MIB1 = 5171 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR) 5172 .add(Root.getOperand(2)); 5173 InsInstrs.push_back(MIB1); 5174 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5175 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) { 5176 Opc = AArch64::FMLAv2i64_indexed; 5177 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5178 FMAInstKind::Indexed, &NewVR); 5179 } else { 5180 Opc = AArch64::FMLAv2f64; 5181 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5182 FMAInstKind::Accumulator, &NewVR); 5183 } 5184 break; 5185 } 5186 } // end switch (Pattern) 5187 // Record MUL and ADD/SUB for deletion 5188 DelInstrs.push_back(MUL); 5189 DelInstrs.push_back(&Root); 5190 } 5191 5192 /// Replace csincr-branch sequence by simple conditional branch 5193 /// 5194 /// Examples: 5195 /// 1. \code 5196 /// csinc w9, wzr, wzr, <condition code> 5197 /// tbnz w9, #0, 0x44 5198 /// \endcode 5199 /// to 5200 /// \code 5201 /// b.<inverted condition code> 5202 /// \endcode 5203 /// 5204 /// 2. \code 5205 /// csinc w9, wzr, wzr, <condition code> 5206 /// tbz w9, #0, 0x44 5207 /// \endcode 5208 /// to 5209 /// \code 5210 /// b.<condition code> 5211 /// \endcode 5212 /// 5213 /// Replace compare and branch sequence by TBZ/TBNZ instruction when the 5214 /// compare's constant operand is power of 2. 
///
/// Examples:
/// \code
///   and  w8, w8, #0x400
///   cbnz w8, L1
/// \endcode
/// to
/// \code
///   tbnz w8, #10, L1
/// \endcode
///
/// \param MI Conditional Branch
/// \return True when the simple conditional branch is generated
///
bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
  bool IsNegativeBranch = false;
  bool IsTestAndBranch = false;
  unsigned TargetBBInMI = 0;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unknown branch instruction?");
  case AArch64::Bcc:
    return false;
  case AArch64::CBZW:
  case AArch64::CBZX:
    TargetBBInMI = 1;
    break;
  case AArch64::CBNZW:
  case AArch64::CBNZX:
    TargetBBInMI = 1;
    IsNegativeBranch = true;
    break;
  case AArch64::TBZW:
  case AArch64::TBZX:
    TargetBBInMI = 2;
    IsTestAndBranch = true;
    break;
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    TargetBBInMI = 2;
    IsNegativeBranch = true;
    IsTestAndBranch = true;
    break;
  }
  // So we increment a zero register and test for bits other
  // than bit 0? Conservatively bail out in case the verifier
  // missed this case.
  if (IsTestAndBranch && MI.getOperand(1).getImm())
    return false;

  // Find Definition.
  assert(MI.getParent() && "Incomplete machine instruction");
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  Register VReg = MI.getOperand(0).getReg();
  if (!Register::isVirtualRegister(VReg))
    return false;

  MachineInstr *DefMI = MRI->getVRegDef(VReg);

  // Look through COPY instructions to find definition.
  while (DefMI->isCopy()) {
    Register CopyVReg = DefMI->getOperand(1).getReg();
    if (!MRI->hasOneNonDBGUse(CopyVReg))
      return false;
    if (!MRI->hasOneDef(CopyVReg))
      return false;
    DefMI = MRI->getVRegDef(CopyVReg);
  }

  switch (DefMI->getOpcode()) {
  default:
    return false;
  // Fold AND into a TBZ/TBNZ if constant operand is power of 2.
  case AArch64::ANDWri:
  case AArch64::ANDXri: {
    if (IsTestAndBranch)
      return false;
    if (DefMI->getParent() != MBB)
      return false;
    if (!MRI->hasOneNonDBGUse(VReg))
      return false;

    bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
    uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
        DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64);
    if (!isPowerOf2_64(Mask))
      return false;

    MachineOperand &MO = DefMI->getOperand(1);
    Register NewReg = MO.getReg();
    if (!Register::isVirtualRegister(NewReg))
      return false;

    assert(!MRI->def_empty(NewReg) && "Register must be defined.");

    MachineBasicBlock &RefToMBB = *MBB;
    MachineBasicBlock *TBB = MI.getOperand(1).getMBB();
    DebugLoc DL = MI.getDebugLoc();
    unsigned Imm = Log2_64(Mask);
    unsigned Opc = (Imm < 32)
                       ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
                       : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
    MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
                              .addReg(NewReg)
                              .addImm(Imm)
                              .addMBB(TBB);
    // The register now lives on to the new TBZ/TBNZ.
    MO.setIsKill(false);

    // For immediates smaller than 32, we need to use the 32-bit
    // variant (W) in all cases, since the 64-bit variant cannot
    // encode them.
5329 // Therefore, if the input register is 64-bit, we need to take the 5330 // 32-bit sub-part. 5331 if (!Is32Bit && Imm < 32) 5332 NewMI->getOperand(0).setSubReg(AArch64::sub_32); 5333 MI.eraseFromParent(); 5334 return true; 5335 } 5336 // Look for CSINC 5337 case AArch64::CSINCWr: 5338 case AArch64::CSINCXr: { 5339 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR && 5340 DefMI->getOperand(2).getReg() == AArch64::WZR) && 5341 !(DefMI->getOperand(1).getReg() == AArch64::XZR && 5342 DefMI->getOperand(2).getReg() == AArch64::XZR)) 5343 return false; 5344 5345 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) 5346 return false; 5347 5348 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm(); 5349 // Convert only when the condition code is not modified between 5350 // the CSINC and the branch. The CC may be used by other 5351 // instructions in between. 5352 if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write)) 5353 return false; 5354 MachineBasicBlock &RefToMBB = *MBB; 5355 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB(); 5356 DebugLoc DL = MI.getDebugLoc(); 5357 if (IsNegativeBranch) 5358 CC = AArch64CC::getInvertedCondCode(CC); 5359 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB); 5360 MI.eraseFromParent(); 5361 return true; 5362 } 5363 } 5364 } 5365 5366 std::pair<unsigned, unsigned> 5367 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { 5368 const unsigned Mask = AArch64II::MO_FRAGMENT; 5369 return std::make_pair(TF & Mask, TF & ~Mask); 5370 } 5371 5372 ArrayRef<std::pair<unsigned, const char *>> 5373 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { 5374 using namespace AArch64II; 5375 5376 static const std::pair<unsigned, const char *> TargetFlags[] = { 5377 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"}, 5378 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"}, 5379 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"}, 5380 {MO_HI12, "aarch64-hi12"}}; 5381 return makeArrayRef(TargetFlags); 5382 } 5383 5384 ArrayRef<std::pair<unsigned, const char *>> 5385 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { 5386 using namespace AArch64II; 5387 5388 static const std::pair<unsigned, const char *> TargetFlags[] = { 5389 {MO_COFFSTUB, "aarch64-coffstub"}, 5390 {MO_GOT, "aarch64-got"}, 5391 {MO_NC, "aarch64-nc"}, 5392 {MO_S, "aarch64-s"}, 5393 {MO_TLS, "aarch64-tls"}, 5394 {MO_DLLIMPORT, "aarch64-dllimport"}, 5395 {MO_PREL, "aarch64-prel"}, 5396 {MO_TAGGED, "aarch64-tagged"}}; 5397 return makeArrayRef(TargetFlags); 5398 } 5399 5400 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> 5401 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const { 5402 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] = 5403 {{MOSuppressPair, "aarch64-suppress-pair"}, 5404 {MOStridedAccess, "aarch64-strided-access"}}; 5405 return makeArrayRef(TargetFlags); 5406 } 5407 5408 /// Constants defining how certain sequences should be outlined. 5409 /// This encompasses how an outlined function should be called, and what kind of 5410 /// frame should be emitted for that outlined function. 5411 /// 5412 /// \p MachineOutlinerDefault implies that the function should be called with 5413 /// a save and restore of LR to the stack. 
5414 /// 5415 /// That is, 5416 /// 5417 /// I1 Save LR OUTLINED_FUNCTION: 5418 /// I2 --> BL OUTLINED_FUNCTION I1 5419 /// I3 Restore LR I2 5420 /// I3 5421 /// RET 5422 /// 5423 /// * Call construction overhead: 3 (save + BL + restore) 5424 /// * Frame construction overhead: 1 (ret) 5425 /// * Requires stack fixups? Yes 5426 /// 5427 /// \p MachineOutlinerTailCall implies that the function is being created from 5428 /// a sequence of instructions ending in a return. 5429 /// 5430 /// That is, 5431 /// 5432 /// I1 OUTLINED_FUNCTION: 5433 /// I2 --> B OUTLINED_FUNCTION I1 5434 /// RET I2 5435 /// RET 5436 /// 5437 /// * Call construction overhead: 1 (B) 5438 /// * Frame construction overhead: 0 (Return included in sequence) 5439 /// * Requires stack fixups? No 5440 /// 5441 /// \p MachineOutlinerNoLRSave implies that the function should be called using 5442 /// a BL instruction, but doesn't require LR to be saved and restored. This 5443 /// happens when LR is known to be dead. 5444 /// 5445 /// That is, 5446 /// 5447 /// I1 OUTLINED_FUNCTION: 5448 /// I2 --> BL OUTLINED_FUNCTION I1 5449 /// I3 I2 5450 /// I3 5451 /// RET 5452 /// 5453 /// * Call construction overhead: 1 (BL) 5454 /// * Frame construction overhead: 1 (RET) 5455 /// * Requires stack fixups? No 5456 /// 5457 /// \p MachineOutlinerThunk implies that the function is being created from 5458 /// a sequence of instructions ending in a call. The outlined function is 5459 /// called with a BL instruction, and the outlined function tail-calls the 5460 /// original call destination. 5461 /// 5462 /// That is, 5463 /// 5464 /// I1 OUTLINED_FUNCTION: 5465 /// I2 --> BL OUTLINED_FUNCTION I1 5466 /// BL f I2 5467 /// B f 5468 /// * Call construction overhead: 1 (BL) 5469 /// * Frame construction overhead: 0 5470 /// * Requires stack fixups? No 5471 /// 5472 /// \p MachineOutlinerRegSave implies that the function should be called with a 5473 /// save and restore of LR to an available register. This allows us to avoid 5474 /// stack fixups. Note that this outlining variant is compatible with the 5475 /// NoLRSave case. 5476 /// 5477 /// That is, 5478 /// 5479 /// I1 Save LR OUTLINED_FUNCTION: 5480 /// I2 --> BL OUTLINED_FUNCTION I1 5481 /// I3 Restore LR I2 5482 /// I3 5483 /// RET 5484 /// 5485 /// * Call construction overhead: 3 (save + BL + restore) 5486 /// * Frame construction overhead: 1 (ret) 5487 /// * Requires stack fixups? No 5488 enum MachineOutlinerClass { 5489 MachineOutlinerDefault, /// Emit a save, restore, call, and return. 5490 MachineOutlinerTailCall, /// Only emit a branch. 5491 MachineOutlinerNoLRSave, /// Emit a call and return. 5492 MachineOutlinerThunk, /// Emit a call and tail-call. 5493 MachineOutlinerRegSave /// Same as default, but save to a register. 5494 }; 5495 5496 enum MachineOutlinerMBBFlags { 5497 LRUnavailableSomewhere = 0x2, 5498 HasCalls = 0x4, 5499 UnsafeRegsDead = 0x8 5500 }; 5501 5502 unsigned 5503 AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const { 5504 assert(C.LRUWasSet && "LRU wasn't set?"); 5505 MachineFunction *MF = C.getMF(); 5506 const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>( 5507 MF->getSubtarget().getRegisterInfo()); 5508 5509 // Check if there is an available register across the sequence that we can 5510 // use. 5511 for (unsigned Reg : AArch64::GPR64RegClass) { 5512 if (!ARI->isReservedReg(*MF, Reg) && 5513 Reg != AArch64::LR && // LR is not reserved, but don't use it. 
        Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
        Reg != AArch64::X17 && // Ditto for X17.
        C.LRU.available(Reg) && C.UsedInSequence.available(Reg))
      return Reg;
  }

  // No suitable register. Return 0.
  return 0u;
}

static bool
outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
                                         const outliner::Candidate &b) {
  const Function &Fa = a.getMF()->getFunction();
  const Function &Fb = b.getMF()->getFunction();

  // If neither function has the "sign-return-address" attribute, their
  // signing behaviour is equal.
  if (!Fa.hasFnAttribute("sign-return-address") &&
      !Fb.hasFnAttribute("sign-return-address")) {
    return true;
  }

  // If both functions have the "sign-return-address" attribute, their signing
  // behaviour is equal if the values of the attribute are equal.
  if (Fa.hasFnAttribute("sign-return-address") &&
      Fb.hasFnAttribute("sign-return-address")) {
    StringRef ScopeA =
        Fa.getFnAttribute("sign-return-address").getValueAsString();
    StringRef ScopeB =
        Fb.getFnAttribute("sign-return-address").getValueAsString();
    return ScopeA.equals(ScopeB);
  }

  // If function B doesn't have the "sign-return-address" attribute but A does,
  // the functions' signing behaviour is equal if A's value for
  // "sign-return-address" is "none" and vice versa.
  if (Fa.hasFnAttribute("sign-return-address")) {
    StringRef ScopeA =
        Fa.getFnAttribute("sign-return-address").getValueAsString();
    return ScopeA.equals("none");
  }

  if (Fb.hasFnAttribute("sign-return-address")) {
    StringRef ScopeB =
        Fb.getFnAttribute("sign-return-address").getValueAsString();
    return ScopeB.equals("none");
  }

  llvm_unreachable("Unknown combination of sign-return-address attributes");
}

static bool
outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
                                       const outliner::Candidate &b) {
  const Function &Fa = a.getMF()->getFunction();
  const Function &Fb = b.getMF()->getFunction();

  // If neither function has the "sign-return-address-key" attribute, their
  // keys are equal.
  if (!Fa.hasFnAttribute("sign-return-address-key") &&
      !Fb.hasFnAttribute("sign-return-address-key")) {
    return true;
  }

  // If both functions have the "sign-return-address-key" attribute, their
  // keys are equal if the values of "sign-return-address-key" are equal.
  if (Fa.hasFnAttribute("sign-return-address-key") &&
      Fb.hasFnAttribute("sign-return-address-key")) {
    StringRef KeyA =
        Fa.getFnAttribute("sign-return-address-key").getValueAsString();
    StringRef KeyB =
        Fb.getFnAttribute("sign-return-address-key").getValueAsString();
    return KeyA.equals(KeyB);
  }

  // If B doesn't have the "sign-return-address-key" attribute, both keys are
  // equal if function A has the default key (a_key).
  if (Fa.hasFnAttribute("sign-return-address-key")) {
    StringRef KeyA =
        Fa.getFnAttribute("sign-return-address-key").getValueAsString();
    return KeyA.equals_lower("a_key");
  }

  if (Fb.hasFnAttribute("sign-return-address-key")) {
    StringRef KeyB =
        Fb.getFnAttribute("sign-return-address-key").getValueAsString();
    return KeyB.equals_lower("a_key");
  }

  llvm_unreachable("Unknown combination of sign-return-address-key attributes");
}

static bool
outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
                                    const outliner::Candidate &b) {
  const AArch64Subtarget &SubtargetA =
      a.getMF()->getSubtarget<AArch64Subtarget>();
  const AArch64Subtarget &SubtargetB =
      b.getMF()->getSubtarget<AArch64Subtarget>();
  return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
}

outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
    std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
  outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
  unsigned SequenceSize =
      std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0,
                      [this](unsigned Sum, const MachineInstr &MI) {
                        return Sum + getInstSizeInBytes(MI);
                      });
  unsigned NumBytesToCreateFrame = 0;

  // We only allow outlining for functions having exactly matching return
  // address signing attributes, i.e., all share the same value for the
  // attribute "sign-return-address" and all share the same type of key they
  // are signed with.
  // Additionally we require all functions to simultaneously either support
  // v8.3a features or not. Otherwise an outlined function could get signed
  // using dedicated v8.3 instructions and a call from a function that doesn't
  // support v8.3 instructions would therefore be invalid.
  if (std::adjacent_find(
          RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
          [](const outliner::Candidate &a, const outliner::Candidate &b) {
            // Return true if a and b are non-equal w.r.t. return address
            // signing or support of v8.3a features.
            if (outliningCandidatesSigningScopeConsensus(a, b) &&
                outliningCandidatesSigningKeyConsensus(a, b) &&
                outliningCandidatesV8_3OpsConsensus(a, b)) {
              return false;
            }
            return true;
          }) != RepeatedSequenceLocs.end()) {
    return outliner::OutlinedFunction();
  }

  // Since at this point all candidates agree on their return address signing,
  // picking just one is fine. If the candidate functions potentially sign
  // their return addresses, the outlined function should do the same. Note
  // that in the case of "sign-return-address"="non-leaf" this is an
  // assumption: It is not certainly true that the outlined function will have
  // to sign its return address, but this decision is made later, when the
  // decision to outline has already been made.
  // The same holds for the number of additional instructions we need: On
  // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
  // necessary. However, at this point we don't know if the outlined function
  // will have a RET instruction so we assume the worst.
  const Function &FCF = FirstCand.getMF()->getFunction();
  const TargetRegisterInfo &TRI = getRegisterInfo();
  if (FCF.hasFnAttribute("sign-return-address")) {
    // One PAC and one AUT instruction.
    NumBytesToCreateFrame += 8;

    // We have to check if sp modifying instructions would get outlined.
5667 // If so we only allow outlining if sp is unchanged overall, so matching 5668 // sub and add instructions are okay to outline, all other sp modifications 5669 // are not 5670 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) { 5671 int SPValue = 0; 5672 MachineBasicBlock::iterator MBBI = C.front(); 5673 for (;;) { 5674 if (MBBI->modifiesRegister(AArch64::SP, &TRI)) { 5675 switch (MBBI->getOpcode()) { 5676 case AArch64::ADDXri: 5677 case AArch64::ADDWri: 5678 assert(MBBI->getNumOperands() == 4 && "Wrong number of operands"); 5679 assert(MBBI->getOperand(2).isImm() && 5680 "Expected operand to be immediate"); 5681 assert(MBBI->getOperand(1).isReg() && 5682 "Expected operand to be a register"); 5683 // Check if the add just increments sp. If so, we search for 5684 // matching sub instructions that decrement sp. If not, the 5685 // modification is illegal 5686 if (MBBI->getOperand(1).getReg() == AArch64::SP) 5687 SPValue += MBBI->getOperand(2).getImm(); 5688 else 5689 return true; 5690 break; 5691 case AArch64::SUBXri: 5692 case AArch64::SUBWri: 5693 assert(MBBI->getNumOperands() == 4 && "Wrong number of operands"); 5694 assert(MBBI->getOperand(2).isImm() && 5695 "Expected operand to be immediate"); 5696 assert(MBBI->getOperand(1).isReg() && 5697 "Expected operand to be a register"); 5698 // Check if the sub just decrements sp. If so, we search for 5699 // matching add instructions that increment sp. If not, the 5700 // modification is illegal 5701 if (MBBI->getOperand(1).getReg() == AArch64::SP) 5702 SPValue -= MBBI->getOperand(2).getImm(); 5703 else 5704 return true; 5705 break; 5706 default: 5707 return true; 5708 } 5709 } 5710 if (MBBI == C.back()) 5711 break; 5712 ++MBBI; 5713 } 5714 if (SPValue) 5715 return true; 5716 return false; 5717 }; 5718 // Remove candidates with illegal stack modifying instructions 5719 RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(), 5720 RepeatedSequenceLocs.end(), 5721 hasIllegalSPModification), 5722 RepeatedSequenceLocs.end()); 5723 5724 // If the sequence doesn't have enough candidates left, then we're done. 5725 if (RepeatedSequenceLocs.size() < 2) 5726 return outliner::OutlinedFunction(); 5727 } 5728 5729 // Properties about candidate MBBs that hold for all of them. 5730 unsigned FlagsSetInAll = 0xF; 5731 5732 // Compute liveness information for each candidate, and set FlagsSetInAll. 5733 std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), 5734 [&FlagsSetInAll](outliner::Candidate &C) { 5735 FlagsSetInAll &= C.Flags; 5736 }); 5737 5738 // According to the AArch64 Procedure Call Standard, the following are 5739 // undefined on entry/exit from a function call: 5740 // 5741 // * Registers x16, x17, (and thus w16, w17) 5742 // * Condition codes (and thus the NZCV register) 5743 // 5744 // Because if this, we can't outline any sequence of instructions where 5745 // one 5746 // of these registers is live into/across it. Thus, we need to delete 5747 // those 5748 // candidates. 5749 auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) { 5750 // If the unsafe registers in this block are all dead, then we don't need 5751 // to compute liveness here. 5752 if (C.Flags & UnsafeRegsDead) 5753 return false; 5754 C.initLRU(TRI); 5755 LiveRegUnits LRU = C.LRU; 5756 return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) || 5757 !LRU.available(AArch64::NZCV)); 5758 }; 5759 5760 // Are there any candidates where those registers are live? 
  if (!(FlagsSetInAll & UnsafeRegsDead)) {
    // Erase every candidate that violates the restrictions above. (It could be
    // true that we have viable candidates, so it's not worth bailing out in
    // the case that, say, 1 out of 20 candidates violates the restrictions.)
    RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
                                              RepeatedSequenceLocs.end(),
                                              CantGuaranteeValueAcrossCall),
                               RepeatedSequenceLocs.end());

    // If the sequence doesn't have enough candidates left, then we're done.
    if (RepeatedSequenceLocs.size() < 2)
      return outliner::OutlinedFunction();
  }

  // At this point, we have only "safe" candidates to outline. Figure out
  // frame + call instruction information.

  unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode();

  // Helper lambda which sets call information for every candidate.
  auto SetCandidateCallInfo =
      [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
        for (outliner::Candidate &C : RepeatedSequenceLocs)
          C.setCallInfo(CallID, NumBytesForCall);
      };

  unsigned FrameID = MachineOutlinerDefault;
  NumBytesToCreateFrame += 4;

  bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
    return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement");
  });

  // Returns true if an instruction is safe to fix up, false otherwise.
  auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
    if (MI.isCall())
      return true;

    if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
        !MI.readsRegister(AArch64::SP, &TRI))
      return true;

    // Any modification of SP will break our code to save/restore LR.
    // FIXME: We could handle some instructions which add a constant
    // offset to SP, with a bit more work.
    if (MI.modifiesRegister(AArch64::SP, &TRI))
      return false;

    // At this point, we have a stack instruction that we might need to
    // fix up. We'll handle it if it's a load or store.
    if (MI.mayLoadOrStore()) {
      const MachineOperand *Base; // Filled with the base operand of MI.
      int64_t Offset;             // Filled with the offset of MI.

      // Does it allow us to offset the base operand and is the base the
      // register SP?
      if (!getMemOperandWithOffset(MI, Base, Offset, &TRI) || !Base->isReg() ||
          Base->getReg() != AArch64::SP)
        return false;

      // Find the minimum/maximum offset for this instruction and check
      // if fixing it up would be in range.
      int64_t MinOffset,
          MaxOffset;  // Unscaled offsets for the instruction.
      unsigned Scale; // The scale to multiply the offsets by.
      unsigned DummyWidth;
      getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);

      Offset += 16; // Update the offset to what it would be if we outlined.
      if (Offset < MinOffset * Scale || Offset > MaxOffset * Scale)
        return false;

      // It's in range, so we can outline it.
      return true;
    }

    // FIXME: Add handling for instructions like "add x0, sp, #8".

    // We can't fix it up, so don't outline it.
    return false;
  };

  // True if it's possible to fix up each stack instruction in this sequence.
  // Important for frames/call variants that modify the stack.
5845 bool AllStackInstrsSafe = std::all_of( 5846 FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup); 5847 5848 // If the last instruction in any candidate is a terminator, then we should 5849 // tail call all of the candidates. 5850 if (RepeatedSequenceLocs[0].back()->isTerminator()) { 5851 FrameID = MachineOutlinerTailCall; 5852 NumBytesToCreateFrame = 0; 5853 SetCandidateCallInfo(MachineOutlinerTailCall, 4); 5854 } 5855 5856 else if (LastInstrOpcode == AArch64::BL || 5857 (LastInstrOpcode == AArch64::BLR && !HasBTI)) { 5858 // FIXME: Do we need to check if the code after this uses the value of LR? 5859 FrameID = MachineOutlinerThunk; 5860 NumBytesToCreateFrame = 0; 5861 SetCandidateCallInfo(MachineOutlinerThunk, 4); 5862 } 5863 5864 else { 5865 // We need to decide how to emit calls + frames. We can always emit the same 5866 // frame if we don't need to save to the stack. If we have to save to the 5867 // stack, then we need a different frame. 5868 unsigned NumBytesNoStackCalls = 0; 5869 std::vector<outliner::Candidate> CandidatesWithoutStackFixups; 5870 5871 // Check if we have to save LR. 5872 for (outliner::Candidate &C : RepeatedSequenceLocs) { 5873 C.initLRU(TRI); 5874 5875 // If we have a noreturn caller, then we're going to be conservative and 5876 // say that we have to save LR. If we don't have a ret at the end of the 5877 // block, then we can't reason about liveness accurately. 5878 // 5879 // FIXME: We can probably do better than always disabling this in 5880 // noreturn functions by fixing up the liveness info. 5881 bool IsNoReturn = 5882 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn); 5883 5884 // Is LR available? If so, we don't need a save. 5885 if (C.LRU.available(AArch64::LR) && !IsNoReturn) { 5886 NumBytesNoStackCalls += 4; 5887 C.setCallInfo(MachineOutlinerNoLRSave, 4); 5888 CandidatesWithoutStackFixups.push_back(C); 5889 } 5890 5891 // Is an unused register available? If so, we won't modify the stack, so 5892 // we can outline with the same frame type as those that don't save LR. 5893 else if (findRegisterToSaveLRTo(C)) { 5894 NumBytesNoStackCalls += 12; 5895 C.setCallInfo(MachineOutlinerRegSave, 12); 5896 CandidatesWithoutStackFixups.push_back(C); 5897 } 5898 5899 // Is SP used in the sequence at all? If not, we don't have to modify 5900 // the stack, so we are guaranteed to get the same frame. 5901 else if (C.UsedInSequence.available(AArch64::SP)) { 5902 NumBytesNoStackCalls += 12; 5903 C.setCallInfo(MachineOutlinerDefault, 12); 5904 CandidatesWithoutStackFixups.push_back(C); 5905 } 5906 5907 // If we outline this, we need to modify the stack. Pretend we don't 5908 // outline this by saving all of its bytes. 5909 else { 5910 NumBytesNoStackCalls += SequenceSize; 5911 } 5912 } 5913 5914 // If there are no places where we have to save LR, then note that we 5915 // don't have to update the stack. Otherwise, give every candidate the 5916 // default call type, as long as it's safe to do so. 5917 if (!AllStackInstrsSafe || 5918 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) { 5919 RepeatedSequenceLocs = CandidatesWithoutStackFixups; 5920 FrameID = MachineOutlinerNoLRSave; 5921 } else { 5922 SetCandidateCallInfo(MachineOutlinerDefault, 12); 5923 } 5924 5925 // If we dropped all of the candidates, bail out here. 5926 if (RepeatedSequenceLocs.size() < 2) { 5927 RepeatedSequenceLocs.clear(); 5928 return outliner::OutlinedFunction(); 5929 } 5930 } 5931 5932 // Does every candidate's MBB contain a call? 
If so, then we might have a call 5933 // in the range. 5934 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { 5935 // Check if the range contains a call. These require a save + restore of the 5936 // link register. 5937 bool ModStackToSaveLR = false; 5938 if (std::any_of(FirstCand.front(), FirstCand.back(), 5939 [](const MachineInstr &MI) { return MI.isCall(); })) 5940 ModStackToSaveLR = true; 5941 5942 // Handle the last instruction separately. If this is a tail call, then the 5943 // last instruction is a call. We don't want to save + restore in this case. 5944 // However, it could be possible that the last instruction is a call without 5945 // it being valid to tail call this sequence. We should consider this as 5946 // well. 5947 else if (FrameID != MachineOutlinerThunk && 5948 FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall()) 5949 ModStackToSaveLR = true; 5950 5951 if (ModStackToSaveLR) { 5952 // We can't fix up the stack. Bail out. 5953 if (!AllStackInstrsSafe) { 5954 RepeatedSequenceLocs.clear(); 5955 return outliner::OutlinedFunction(); 5956 } 5957 5958 // Save + restore LR. 5959 NumBytesToCreateFrame += 8; 5960 } 5961 } 5962 5963 return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, 5964 NumBytesToCreateFrame, FrameID); 5965 } 5966 5967 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom( 5968 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const { 5969 const Function &F = MF.getFunction(); 5970 5971 // Can F be deduplicated by the linker? If it can, don't outline from it. 5972 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage()) 5973 return false; 5974 5975 // Don't outline from functions with section markings; the program could 5976 // expect that all the code is in the named section. 5977 // FIXME: Allow outlining from multiple functions with the same section 5978 // marking. 5979 if (F.hasSection()) 5980 return false; 5981 5982 // Outlining from functions with redzones is unsafe since the outliner may 5983 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't 5984 // outline from it. 5985 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); 5986 if (!AFI || AFI->hasRedZone().getValueOr(true)) 5987 return false; 5988 5989 // It's safe to outline from MF. 5990 return true; 5991 } 5992 5993 bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, 5994 unsigned &Flags) const { 5995 // Check if LR is available through all of the MBB. If it's not, then set 5996 // a flag. 5997 assert(MBB.getParent()->getRegInfo().tracksLiveness() && 5998 "Suitable Machine Function for outlining must track liveness"); 5999 LiveRegUnits LRU(getRegisterInfo()); 6000 6001 std::for_each(MBB.rbegin(), MBB.rend(), 6002 [&LRU](MachineInstr &MI) { LRU.accumulate(MI); }); 6003 6004 // Check if each of the unsafe registers are available... 6005 bool W16AvailableInBlock = LRU.available(AArch64::W16); 6006 bool W17AvailableInBlock = LRU.available(AArch64::W17); 6007 bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV); 6008 6009 // If all of these are dead (and not live out), we know we don't have to check 6010 // them later. 6011 if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock) 6012 Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead; 6013 6014 // Now, add the live outs to the set. 6015 LRU.addLiveOuts(MBB); 6016 6017 // If any of these registers is available in the MBB, but also a live out of 6018 // the block, then we know outlining is unsafe. 
6019 if (W16AvailableInBlock && !LRU.available(AArch64::W16)) 6020 return false; 6021 if (W17AvailableInBlock && !LRU.available(AArch64::W17)) 6022 return false; 6023 if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV)) 6024 return false; 6025 6026 // Check if there's a call inside this MachineBasicBlock. If there is, then 6027 // set a flag. 6028 if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); })) 6029 Flags |= MachineOutlinerMBBFlags::HasCalls; 6030 6031 MachineFunction *MF = MBB.getParent(); 6032 6033 // In the event that we outline, we may have to save LR. If there is an 6034 // available register in the MBB, then we'll always save LR there. Check if 6035 // this is true. 6036 bool CanSaveLR = false; 6037 const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>( 6038 MF->getSubtarget().getRegisterInfo()); 6039 6040 // Check if there is an available register across the sequence that we can 6041 // use. 6042 for (unsigned Reg : AArch64::GPR64RegClass) { 6043 if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR && 6044 Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) { 6045 CanSaveLR = true; 6046 break; 6047 } 6048 } 6049 6050 // Check if we have a register we can save LR to, and if LR was used 6051 // somewhere. If both of those things are true, then we need to evaluate the 6052 // safety of outlining stack instructions later. 6053 if (!CanSaveLR && !LRU.available(AArch64::LR)) 6054 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere; 6055 6056 return true; 6057 } 6058 6059 outliner::InstrType 6060 AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, 6061 unsigned Flags) const { 6062 MachineInstr &MI = *MIT; 6063 MachineBasicBlock *MBB = MI.getParent(); 6064 MachineFunction *MF = MBB->getParent(); 6065 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); 6066 6067 // Don't outline anything used for return address signing. The outlined 6068 // function will get signed later if needed 6069 switch (MI.getOpcode()) { 6070 case AArch64::PACIASP: 6071 case AArch64::PACIBSP: 6072 case AArch64::AUTIASP: 6073 case AArch64::AUTIBSP: 6074 case AArch64::RETAA: 6075 case AArch64::RETAB: 6076 case AArch64::EMITBKEY: 6077 return outliner::InstrType::Illegal; 6078 } 6079 6080 // Don't outline LOHs. 6081 if (FuncInfo->getLOHRelated().count(&MI)) 6082 return outliner::InstrType::Illegal; 6083 6084 // Don't allow debug values to impact outlining type. 6085 if (MI.isDebugInstr() || MI.isIndirectDebugValue()) 6086 return outliner::InstrType::Invisible; 6087 6088 // At this point, KILL instructions don't really tell us much so we can go 6089 // ahead and skip over them. 6090 if (MI.isKill()) 6091 return outliner::InstrType::Invisible; 6092 6093 // Is this a terminator for a basic block? 6094 if (MI.isTerminator()) { 6095 6096 // Is this the end of a function? 6097 if (MI.getParent()->succ_empty()) 6098 return outliner::InstrType::Legal; 6099 6100 // It's not, so don't outline it. 6101 return outliner::InstrType::Illegal; 6102 } 6103 6104 // Make sure none of the operands are un-outlinable. 6105 for (const MachineOperand &MOP : MI.operands()) { 6106 if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() || 6107 MOP.isTargetIndex()) 6108 return outliner::InstrType::Illegal; 6109 6110 // If it uses LR or W30 explicitly, then don't touch it. 
    if (MOP.isReg() && !MOP.isImplicit() &&
        (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
      return outliner::InstrType::Illegal;
  }

  // Special cases for instructions that can always be outlined, but will fail
  // the later tests. e.g. ADRPs, which are PC-relative, use LR, but can always
  // be outlined because they don't require a *specific* value to be in LR.
  if (MI.getOpcode() == AArch64::ADRP)
    return outliner::InstrType::Legal;

  // If MI is a call we might be able to outline it. We don't want to outline
  // any calls that rely on the position of items on the stack. When we outline
  // something containing a call, we have to emit a save and restore of LR in
  // the outlined function. Currently, this always happens by saving LR to the
  // stack. Thus, if we outline, say, half the parameters for a function call
  // plus the call, then we'll break the callee's expectations for the layout
  // of the stack.
  //
  // FIXME: Allow calls to functions which construct a stack frame, as long
  // as they don't access arguments on the stack.
  // FIXME: Figure out some way to analyze functions defined in other modules.
  // We should be able to compute the memory usage based on the IR calling
  // convention, even if we can't see the definition.
  if (MI.isCall()) {
    // Get the function associated with the call. Look at each operand and find
    // the one that represents the callee and get its name.
    const Function *Callee = nullptr;
    for (const MachineOperand &MOP : MI.operands()) {
      if (MOP.isGlobal()) {
        Callee = dyn_cast<Function>(MOP.getGlobal());
        break;
      }
    }

    // Never outline calls to mcount. There isn't any rule that would require
    // this, but the Linux kernel's "ftrace" feature depends on it.
    if (Callee && Callee->getName() == "\01_mcount")
      return outliner::InstrType::Illegal;

    // If we don't know anything about the callee, assume it depends on the
    // stack layout of the caller. In that case, it's only legal to outline
    // as a tail-call. Whitelist the call instructions we know about so we
    // don't get unexpected results with call pseudo-instructions.
    auto UnknownCallOutlineType = outliner::InstrType::Illegal;
    if (MI.getOpcode() == AArch64::BLR || MI.getOpcode() == AArch64::BL)
      UnknownCallOutlineType = outliner::InstrType::LegalTerminator;

    if (!Callee)
      return UnknownCallOutlineType;

    // We have a function we have information about. Check if it's something
    // we can safely outline.
    MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);

    // We don't know what's going on with the callee at all. Don't touch it.
    if (!CalleeMF)
      return UnknownCallOutlineType;

    // Check if we know anything about the callee saves on the function. If we
    // don't, then don't touch it, since that implies that we haven't
    // computed anything about its stack frame yet.
    MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
    if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
        MFI.getNumObjects() > 0)
      return UnknownCallOutlineType;

    // At this point, we can say that CalleeMF ought to not pass anything on
    // the stack. Therefore, we can outline it.
    return outliner::InstrType::Legal;
  }

  // Don't outline positions.
6184 if (MI.isPosition()) 6185 return outliner::InstrType::Illegal; 6186 6187 // Don't touch the link register or W30. 6188 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) || 6189 MI.modifiesRegister(AArch64::W30, &getRegisterInfo())) 6190 return outliner::InstrType::Illegal; 6191 6192 // Don't outline BTI instructions, because that will prevent the outlining 6193 // site from being indirectly callable. 6194 if (MI.getOpcode() == AArch64::HINT) { 6195 int64_t Imm = MI.getOperand(0).getImm(); 6196 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38) 6197 return outliner::InstrType::Illegal; 6198 } 6199 6200 return outliner::InstrType::Legal; 6201 } 6202 6203 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const { 6204 for (MachineInstr &MI : MBB) { 6205 const MachineOperand *Base; 6206 unsigned Width; 6207 int64_t Offset; 6208 6209 // Is this a load or store with an immediate offset with SP as the base? 6210 if (!MI.mayLoadOrStore() || 6211 !getMemOperandWithOffsetWidth(MI, Base, Offset, Width, &RI) || 6212 (Base->isReg() && Base->getReg() != AArch64::SP)) 6213 continue; 6214 6215 // It is, so we have to fix it up. 6216 unsigned Scale; 6217 int64_t Dummy1, Dummy2; 6218 6219 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI); 6220 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!"); 6221 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2); 6222 assert(Scale != 0 && "Unexpected opcode!"); 6223 6224 // We've pushed the return address to the stack, so add 16 to the offset. 6225 // This is safe, since we already checked if it would overflow when we 6226 // checked if this instruction was legal to outline. 6227 int64_t NewImm = (Offset + 16) / Scale; 6228 StackOffsetOperand.setImm(NewImm); 6229 } 6230 } 6231 6232 static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, 6233 bool ShouldSignReturnAddr, 6234 bool ShouldSignReturnAddrWithAKey) { 6235 if (ShouldSignReturnAddr) { 6236 MachineBasicBlock::iterator MBBPAC = MBB.begin(); 6237 MachineBasicBlock::iterator MBBAUT = MBB.getFirstTerminator(); 6238 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); 6239 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 6240 DebugLoc DL; 6241 6242 if (MBBAUT != MBB.end()) 6243 DL = MBBAUT->getDebugLoc(); 6244 6245 // At the very beginning of the basic block we insert the following 6246 // depending on the key type 6247 // 6248 // a_key: b_key: 6249 // PACIASP EMITBKEY 6250 // CFI_INSTRUCTION PACIBSP 6251 // CFI_INSTRUCTION 6252 if (ShouldSignReturnAddrWithAKey) { 6253 BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIASP)) 6254 .setMIFlag(MachineInstr::FrameSetup); 6255 } else { 6256 BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::EMITBKEY)) 6257 .setMIFlag(MachineInstr::FrameSetup); 6258 BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIBSP)) 6259 .setMIFlag(MachineInstr::FrameSetup); 6260 } 6261 unsigned CFIIndex = 6262 MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); 6263 BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION)) 6264 .addCFIIndex(CFIIndex) 6265 .setMIFlags(MachineInstr::FrameSetup); 6266 6267 // If v8.3a features are available we can replace a RET instruction by 6268 // RETAA or RETAB and omit the AUT instructions 6269 if (Subtarget.hasV8_3aOps() && MBBAUT != MBB.end() && 6270 MBBAUT->getOpcode() == AArch64::RET) { 6271 BuildMI(MBB, MBBAUT, DL, 6272 TII->get(ShouldSignReturnAddrWithAKey ? 
AArch64::RETAA 6273 : AArch64::RETAB)) 6274 .copyImplicitOps(*MBBAUT); 6275 MBB.erase(MBBAUT); 6276 } else { 6277 BuildMI(MBB, MBBAUT, DL, 6278 TII->get(ShouldSignReturnAddrWithAKey ? AArch64::AUTIASP 6279 : AArch64::AUTIBSP)) 6280 .setMIFlag(MachineInstr::FrameDestroy); 6281 } 6282 } 6283 } 6284 6285 void AArch64InstrInfo::buildOutlinedFrame( 6286 MachineBasicBlock &MBB, MachineFunction &MF, 6287 const outliner::OutlinedFunction &OF) const { 6288 // For thunk outlining, rewrite the last instruction from a call to a 6289 // tail-call. 6290 if (OF.FrameConstructionID == MachineOutlinerThunk) { 6291 MachineInstr *Call = &*--MBB.instr_end(); 6292 unsigned TailOpcode; 6293 if (Call->getOpcode() == AArch64::BL) { 6294 TailOpcode = AArch64::TCRETURNdi; 6295 } else { 6296 assert(Call->getOpcode() == AArch64::BLR); 6297 TailOpcode = AArch64::TCRETURNriALL; 6298 } 6299 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode)) 6300 .add(Call->getOperand(0)) 6301 .addImm(0); 6302 MBB.insert(MBB.end(), TC); 6303 Call->eraseFromParent(); 6304 } 6305 6306 bool IsLeafFunction = true; 6307 6308 // Is there a call in the outlined range? 6309 auto IsNonTailCall = [](const MachineInstr &MI) { 6310 return MI.isCall() && !MI.isReturn(); 6311 }; 6312 6313 if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) { 6314 // Fix up the instructions in the range, since we're going to modify the 6315 // stack. 6316 assert(OF.FrameConstructionID != MachineOutlinerDefault && 6317 "Can only fix up stack references once"); 6318 fixupPostOutline(MBB); 6319 6320 IsLeafFunction = false; 6321 6322 // LR has to be a live in so that we can save it. 6323 MBB.addLiveIn(AArch64::LR); 6324 6325 MachineBasicBlock::iterator It = MBB.begin(); 6326 MachineBasicBlock::iterator Et = MBB.end(); 6327 6328 if (OF.FrameConstructionID == MachineOutlinerTailCall || 6329 OF.FrameConstructionID == MachineOutlinerThunk) 6330 Et = std::prev(MBB.end()); 6331 6332 // Insert a save before the outlined region 6333 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) 6334 .addReg(AArch64::SP, RegState::Define) 6335 .addReg(AArch64::LR) 6336 .addReg(AArch64::SP) 6337 .addImm(-16); 6338 It = MBB.insert(It, STRXpre); 6339 6340 const TargetSubtargetInfo &STI = MF.getSubtarget(); 6341 const MCRegisterInfo *MRI = STI.getRegisterInfo(); 6342 unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true); 6343 6344 // Add a CFI saying the stack was moved 16 B down. 6345 int64_t StackPosEntry = 6346 MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 16)); 6347 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) 6348 .addCFIIndex(StackPosEntry) 6349 .setMIFlags(MachineInstr::FrameSetup); 6350 6351 // Add a CFI saying that the LR that we want to find is now 16 B higher than 6352 // before. 6353 int64_t LRPosEntry = 6354 MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, 16)); 6355 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) 6356 .addCFIIndex(LRPosEntry) 6357 .setMIFlags(MachineInstr::FrameSetup); 6358 6359 // Insert a restore before the terminator for the function. 6360 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) 6361 .addReg(AArch64::SP, RegState::Define) 6362 .addReg(AArch64::LR, RegState::Define) 6363 .addReg(AArch64::SP) 6364 .addImm(16); 6365 Et = MBB.insert(Et, LDRXpost); 6366 } 6367 6368 // If a bunch of candidates reach this point they must agree on their return 6369 // address signing. 
It is therefore enough to just consider the signing 6370 // behaviour of one of them 6371 const Function &CF = OF.Candidates.front().getMF()->getFunction(); 6372 bool ShouldSignReturnAddr = false; 6373 if (CF.hasFnAttribute("sign-return-address")) { 6374 StringRef Scope = 6375 CF.getFnAttribute("sign-return-address").getValueAsString(); 6376 if (Scope.equals("all")) 6377 ShouldSignReturnAddr = true; 6378 else if (Scope.equals("non-leaf") && !IsLeafFunction) 6379 ShouldSignReturnAddr = true; 6380 } 6381 6382 // a_key is the default 6383 bool ShouldSignReturnAddrWithAKey = true; 6384 if (CF.hasFnAttribute("sign-return-address-key")) { 6385 const StringRef Key = 6386 CF.getFnAttribute("sign-return-address-key").getValueAsString(); 6387 // Key can either be a_key or b_key 6388 assert((Key.equals_lower("a_key") || Key.equals_lower("b_key")) && 6389 "Return address signing key must be either a_key or b_key"); 6390 ShouldSignReturnAddrWithAKey = Key.equals_lower("a_key"); 6391 } 6392 6393 // If this is a tail call outlined function, then there's already a return. 6394 if (OF.FrameConstructionID == MachineOutlinerTailCall || 6395 OF.FrameConstructionID == MachineOutlinerThunk) { 6396 signOutlinedFunction(MF, MBB, ShouldSignReturnAddr, 6397 ShouldSignReturnAddrWithAKey); 6398 return; 6399 } 6400 6401 // It's not a tail call, so we have to insert the return ourselves. 6402 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET)) 6403 .addReg(AArch64::LR, RegState::Undef); 6404 MBB.insert(MBB.end(), ret); 6405 6406 signOutlinedFunction(MF, MBB, ShouldSignReturnAddr, 6407 ShouldSignReturnAddrWithAKey); 6408 6409 // Did we have to modify the stack by saving the link register? 6410 if (OF.FrameConstructionID != MachineOutlinerDefault) 6411 return; 6412 6413 // We modified the stack. 6414 // Walk over the basic block and fix up all the stack accesses. 6415 fixupPostOutline(MBB); 6416 } 6417 6418 MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall( 6419 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, 6420 MachineFunction &MF, const outliner::Candidate &C) const { 6421 6422 // Are we tail calling? 6423 if (C.CallConstructionID == MachineOutlinerTailCall) { 6424 // If yes, then we can just branch to the label. 6425 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi)) 6426 .addGlobalAddress(M.getNamedValue(MF.getName())) 6427 .addImm(0)); 6428 return It; 6429 } 6430 6431 // Are we saving the link register? 6432 if (C.CallConstructionID == MachineOutlinerNoLRSave || 6433 C.CallConstructionID == MachineOutlinerThunk) { 6434 // No, so just insert the call. 6435 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL)) 6436 .addGlobalAddress(M.getNamedValue(MF.getName()))); 6437 return It; 6438 } 6439 6440 // We want to return the spot where we inserted the call. 6441 MachineBasicBlock::iterator CallPt; 6442 6443 // Instructions for saving and restoring LR around the call instruction we're 6444 // going to insert. 6445 MachineInstr *Save; 6446 MachineInstr *Restore; 6447 // Can we save to a register? 6448 if (C.CallConstructionID == MachineOutlinerRegSave) { 6449 // FIXME: This logic should be sunk into a target-specific interface so that 6450 // we don't have to recompute the register. 6451 unsigned Reg = findRegisterToSaveLRTo(C); 6452 assert(Reg != 0 && "No callee-saved register available?"); 6453 6454 // Save and restore LR from that register. 
6455 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg) 6456 .addReg(AArch64::XZR) 6457 .addReg(AArch64::LR) 6458 .addImm(0); 6459 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR) 6460 .addReg(AArch64::XZR) 6461 .addReg(Reg) 6462 .addImm(0); 6463 } else { 6464 // We have the default case. Save and restore from SP. 6465 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) 6466 .addReg(AArch64::SP, RegState::Define) 6467 .addReg(AArch64::LR) 6468 .addReg(AArch64::SP) 6469 .addImm(-16); 6470 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) 6471 .addReg(AArch64::SP, RegState::Define) 6472 .addReg(AArch64::LR, RegState::Define) 6473 .addReg(AArch64::SP) 6474 .addImm(16); 6475 } 6476 6477 It = MBB.insert(It, Save); 6478 It++; 6479 6480 // Insert the call. 6481 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL)) 6482 .addGlobalAddress(M.getNamedValue(MF.getName()))); 6483 CallPt = It; 6484 It++; 6485 6486 It = MBB.insert(It, Restore); 6487 return CallPt; 6488 } 6489 6490 bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault( 6491 MachineFunction &MF) const { 6492 return MF.getFunction().hasMinSize(); 6493 } 6494 6495 Optional<DestSourcePair> 6496 AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { 6497 6498 // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg 6499 // and zero immediate operands used as an alias for mov instruction. 6500 if (MI.getOpcode() == AArch64::ORRWrs && 6501 MI.getOperand(1).getReg() == AArch64::WZR && 6502 MI.getOperand(3).getImm() == 0x0) { 6503 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)}; 6504 } 6505 6506 if (MI.getOpcode() == AArch64::ORRXrs && 6507 MI.getOperand(1).getReg() == AArch64::XZR && 6508 MI.getOperand(3).getImm() == 0x0) { 6509 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)}; 6510 } 6511 6512 return None; 6513 } 6514 6515 Optional<RegImmPair> AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, 6516 Register Reg) const { 6517 int Sign = 1; 6518 int64_t Offset = 0; 6519 6520 // TODO: Handle cases where Reg is a super- or sub-register of the 6521 // destination register. 6522 if (Reg != MI.getOperand(0).getReg()) 6523 return None; 6524 6525 switch (MI.getOpcode()) { 6526 default: 6527 return None; 6528 case AArch64::SUBWri: 6529 case AArch64::SUBXri: 6530 case AArch64::SUBSWri: 6531 case AArch64::SUBSXri: 6532 Sign *= -1; 6533 LLVM_FALLTHROUGH; 6534 case AArch64::ADDSWri: 6535 case AArch64::ADDSXri: 6536 case AArch64::ADDWri: 6537 case AArch64::ADDXri: { 6538 // TODO: Third operand can be global address (usually some string). 6539 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() || 6540 !MI.getOperand(2).isImm()) 6541 return None; 6542 Offset = MI.getOperand(2).getImm() * Sign; 6543 int Shift = MI.getOperand(3).getImm(); 6544 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12"); 6545 Offset = Offset << Shift; 6546 } 6547 } 6548 return RegImmPair{MI.getOperand(1).getReg(), Offset}; 6549 } 6550 6551 /// If the given ORR instruction is a copy, and \p DescribedReg overlaps with 6552 /// the destination register then, if possible, describe the value in terms of 6553 /// the source register. 
static Optional<ParamLoadedValue>
describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
                       const TargetInstrInfo *TII,
                       const TargetRegisterInfo *TRI) {
  auto DestSrc = TII->isCopyInstr(MI);
  if (!DestSrc)
    return None;

  Register DestReg = DestSrc->Destination->getReg();
  Register SrcReg = DestSrc->Source->getReg();

  auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});

  // If the described register is the destination, just return the source.
  if (DestReg == DescribedReg)
    return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);

  // ORRWrs zero-extends to 64 bits, so we need to consider such cases.
  if (MI.getOpcode() == AArch64::ORRWrs &&
      TRI->isSuperRegister(DestReg, DescribedReg))
    return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);

  // We may need to describe the lower part of an ORRXrs move.
  if (MI.getOpcode() == AArch64::ORRXrs &&
      TRI->isSubRegister(DestReg, DescribedReg)) {
    Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
    return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
  }

  assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
         "Unhandled ORR[XW]rs copy case");

  return None;
}

Optional<ParamLoadedValue>
AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
                                      Register Reg) const {
  const MachineFunction *MF = MI.getMF();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  switch (MI.getOpcode()) {
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: {
    // MOVZWi may be used for producing zero-extended 32-bit immediates in
    // 64-bit parameters, so we need to consider super-registers.
    if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
      return None;

    if (!MI.getOperand(1).isImm())
      return None;
    int64_t Immediate = MI.getOperand(1).getImm();
    int Shift = MI.getOperand(2).getImm();
    return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
                            nullptr);
  }
  case AArch64::ORRWrs:
  case AArch64::ORRXrs:
    return describeORRLoadedValue(MI, Reg, this, TRI);
  }

  return TargetInstrInfo::describeLoadedValue(MI, Reg);
}

#define GET_INSTRINFO_HELPERS
#include "AArch64GenInstrInfo.inc"
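
// Note (illustrative, not part of upstream): a minimal sketch of the
// immediate decomposition performed by isAddImmediate above, reduced to plain
// arithmetic so the sign and shift handling is easy to follow. The helper
// name and the struct below are hypothetical and exist only for this example;
// the real hook reads the same fields out of the MachineInstr operands. The
// block is compiled out and has no effect on the build.
#if 0
struct ExampleRegImm {
  unsigned SrcReg; // Register the immediate is applied to.
  int64_t Offset;  // Signed byte offset, with the shift already applied.
};

// Decompose "ADD/SUB <dst>, <src>, #imm {, lsl #12}" into (src, offset).
static Optional<ExampleRegImm> exampleDecomposeAddImm(unsigned SrcReg,
                                                      uint64_t Imm,
                                                      unsigned Shift,
                                                      bool IsSub) {
  // Only shifts of 0 or 12 are encodable for ADD/SUB (immediate).
  if (Shift != 0 && Shift != 12)
    return None;
  int64_t Offset = static_cast<int64_t>(Imm) << Shift;
  return ExampleRegImm{SrcReg, IsSub ? -Offset : Offset};
}

// Worked examples:
//   exampleDecomposeAddImm(AArch64::X1, 16, 12, /*IsSub=*/false) -> {X1, 65536}
//   exampleDecomposeAddImm(AArch64::W1, 4, 0, /*IsSub=*/true)    -> {W1, -4}
#endif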