1 //===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 /// The pass tries to use the 32-bit encoding for instructions when possible. 8 //===----------------------------------------------------------------------===// 9 // 10 11 #include "AMDGPU.h" 12 #include "GCNSubtarget.h" 13 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 14 #include "llvm/ADT/Statistic.h" 15 #include "llvm/CodeGen/MachineFunctionPass.h" 16 17 #define DEBUG_TYPE "si-shrink-instructions" 18 19 STATISTIC(NumInstructionsShrunk, 20 "Number of 64-bit instruction reduced to 32-bit."); 21 STATISTIC(NumLiteralConstantsFolded, 22 "Number of literal constants folded into 32-bit instructions."); 23 24 using namespace llvm; 25 26 namespace { 27 28 class SIShrinkInstructions : public MachineFunctionPass { 29 public: 30 static char ID; 31 32 void shrinkMIMG(MachineInstr &MI); 33 34 public: 35 SIShrinkInstructions() : MachineFunctionPass(ID) { 36 } 37 38 bool runOnMachineFunction(MachineFunction &MF) override; 39 40 StringRef getPassName() const override { return "SI Shrink Instructions"; } 41 42 void getAnalysisUsage(AnalysisUsage &AU) const override { 43 AU.setPreservesCFG(); 44 MachineFunctionPass::getAnalysisUsage(AU); 45 } 46 }; 47 48 } // End anonymous namespace. 49 50 INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE, 51 "SI Shrink Instructions", false, false) 52 53 char SIShrinkInstructions::ID = 0; 54 55 FunctionPass *llvm::createSIShrinkInstructionsPass() { 56 return new SIShrinkInstructions(); 57 } 58 59 /// This function checks \p MI for operands defined by a move immediate 60 /// instruction and then folds the literal constant into the instruction if it 61 /// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instructions. 62 static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, 63 MachineRegisterInfo &MRI, bool TryToCommute = true) { 64 assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI)); 65 66 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); 67 68 // Try to fold Src0 69 MachineOperand &Src0 = MI.getOperand(Src0Idx); 70 if (Src0.isReg()) { 71 Register Reg = Src0.getReg(); 72 if (Reg.isVirtual() && MRI.hasOneUse(Reg)) { 73 MachineInstr *Def = MRI.getUniqueVRegDef(Reg); 74 if (Def && Def->isMoveImmediate()) { 75 MachineOperand &MovSrc = Def->getOperand(1); 76 bool ConstantFolded = false; 77 78 if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) { 79 if (MovSrc.isImm() && 80 (isInt<32>(MovSrc.getImm()) || isUInt<32>(MovSrc.getImm()))) { 81 Src0.ChangeToImmediate(MovSrc.getImm()); 82 ConstantFolded = true; 83 } else if (MovSrc.isFI()) { 84 Src0.ChangeToFrameIndex(MovSrc.getIndex()); 85 ConstantFolded = true; 86 } else if (MovSrc.isGlobal()) { 87 Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(), 88 MovSrc.getTargetFlags()); 89 ConstantFolded = true; 90 } 91 } 92 93 if (ConstantFolded) { 94 assert(MRI.use_empty(Reg)); 95 Def->eraseFromParent(); 96 ++NumLiteralConstantsFolded; 97 return true; 98 } 99 } 100 } 101 } 102 103 // We have failed to fold src0, so commute the instruction and try again. 104 if (TryToCommute && MI.isCommutable()) { 105 if (TII->commuteInstruction(MI)) { 106 if (foldImmediates(MI, TII, MRI, false)) 107 return true; 108 109 // Commute back. 110 TII->commuteInstruction(MI); 111 } 112 } 113 114 return false; 115 } 116 117 static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) { 118 return isInt<16>(Src.getImm()) && 119 !TII->isInlineConstant(*Src.getParent(), 120 Src.getParent()->getOperandNo(&Src)); 121 } 122 123 static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) { 124 return isUInt<16>(Src.getImm()) && 125 !TII->isInlineConstant(*Src.getParent(), 126 Src.getParent()->getOperandNo(&Src)); 127 } 128 129 static bool isKImmOrKUImmOperand(const SIInstrInfo *TII, 130 const MachineOperand &Src, 131 bool &IsUnsigned) { 132 if (isInt<16>(Src.getImm())) { 133 IsUnsigned = false; 134 return !TII->isInlineConstant(Src); 135 } 136 137 if (isUInt<16>(Src.getImm())) { 138 IsUnsigned = true; 139 return !TII->isInlineConstant(Src); 140 } 141 142 return false; 143 } 144 145 /// \returns true if the constant in \p Src should be replaced with a bitreverse 146 /// of an inline immediate. 147 static bool isReverseInlineImm(const SIInstrInfo *TII, 148 const MachineOperand &Src, 149 int32_t &ReverseImm) { 150 if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src)) 151 return false; 152 153 ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm())); 154 return ReverseImm >= -16 && ReverseImm <= 64; 155 } 156 157 /// Copy implicit register operands from specified instruction to this 158 /// instruction that are not part of the instruction definition. 159 static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF, 160 const MachineInstr &MI) { 161 for (unsigned i = MI.getDesc().getNumOperands() + 162 MI.getDesc().getNumImplicitUses() + 163 MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands(); 164 i != e; ++i) { 165 const MachineOperand &MO = MI.getOperand(i); 166 if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask()) 167 NewMI.addOperand(MF, MO); 168 } 169 } 170 171 static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) { 172 // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to 173 // get constants on the RHS. 174 if (!MI.getOperand(0).isReg()) 175 TII->commuteInstruction(MI, false, 0, 1); 176 177 // cmpk requires src0 to be a register 178 const MachineOperand &Src0 = MI.getOperand(0); 179 if (!Src0.isReg()) 180 return; 181 182 const MachineOperand &Src1 = MI.getOperand(1); 183 if (!Src1.isImm()) 184 return; 185 186 int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode()); 187 if (SOPKOpc == -1) 188 return; 189 190 // eq/ne is special because the imm16 can be treated as signed or unsigned, 191 // and initially selected to the unsigned versions. 192 if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) { 193 bool HasUImm; 194 if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) { 195 if (!HasUImm) { 196 SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ? 197 AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32; 198 } 199 200 MI.setDesc(TII->get(SOPKOpc)); 201 } 202 203 return; 204 } 205 206 const MCInstrDesc &NewDesc = TII->get(SOPKOpc); 207 208 if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) || 209 (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) { 210 MI.setDesc(NewDesc); 211 } 212 } 213 214 // Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding. 215 void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) { 216 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); 217 if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA) 218 return; 219 220 MachineFunction *MF = MI.getParent()->getParent(); 221 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 222 const SIInstrInfo *TII = ST.getInstrInfo(); 223 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 224 int VAddr0Idx = 225 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); 226 unsigned NewAddrDwords = Info->VAddrDwords; 227 const TargetRegisterClass *RC; 228 229 if (Info->VAddrDwords == 2) { 230 RC = &AMDGPU::VReg_64RegClass; 231 } else if (Info->VAddrDwords == 3) { 232 RC = &AMDGPU::VReg_96RegClass; 233 } else if (Info->VAddrDwords == 4) { 234 RC = &AMDGPU::VReg_128RegClass; 235 } else if (Info->VAddrDwords == 5) { 236 RC = &AMDGPU::VReg_160RegClass; 237 } else if (Info->VAddrDwords == 6) { 238 RC = &AMDGPU::VReg_192RegClass; 239 } else if (Info->VAddrDwords == 7) { 240 RC = &AMDGPU::VReg_224RegClass; 241 } else if (Info->VAddrDwords == 8) { 242 RC = &AMDGPU::VReg_256RegClass; 243 } else { 244 RC = &AMDGPU::VReg_512RegClass; 245 NewAddrDwords = 16; 246 } 247 248 unsigned VgprBase = 0; 249 bool IsUndef = true; 250 bool IsKill = NewAddrDwords == Info->VAddrDwords; 251 for (unsigned i = 0; i < Info->VAddrDwords; ++i) { 252 const MachineOperand &Op = MI.getOperand(VAddr0Idx + i); 253 unsigned Vgpr = TRI.getHWRegIndex(Op.getReg()); 254 255 if (i == 0) { 256 VgprBase = Vgpr; 257 } else if (VgprBase + i != Vgpr) 258 return; 259 260 if (!Op.isUndef()) 261 IsUndef = false; 262 if (!Op.isKill()) 263 IsKill = false; 264 } 265 266 if (VgprBase + NewAddrDwords > 256) 267 return; 268 269 // Further check for implicit tied operands - this may be present if TFE is 270 // enabled 271 int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe); 272 int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe); 273 unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm(); 274 unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm(); 275 int ToUntie = -1; 276 if (TFEVal || LWEVal) { 277 // TFE/LWE is enabled so we need to deal with an implicit tied operand 278 for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) { 279 if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() && 280 MI.getOperand(i).isImplicit()) { 281 // This is the tied operand 282 assert( 283 ToUntie == -1 && 284 "found more than one tied implicit operand when expecting only 1"); 285 ToUntie = i; 286 MI.untieRegOperand(ToUntie); 287 } 288 } 289 } 290 291 unsigned NewOpcode = 292 AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default, 293 Info->VDataDwords, NewAddrDwords); 294 MI.setDesc(TII->get(NewOpcode)); 295 MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase)); 296 MI.getOperand(VAddr0Idx).setIsUndef(IsUndef); 297 MI.getOperand(VAddr0Idx).setIsKill(IsKill); 298 299 for (unsigned i = 1; i < Info->VAddrDwords; ++i) 300 MI.RemoveOperand(VAddr0Idx + 1); 301 302 if (ToUntie >= 0) { 303 MI.tieOperands( 304 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata), 305 ToUntie - (Info->VAddrDwords - 1)); 306 } 307 } 308 309 /// Attempt to shink AND/OR/XOR operations requiring non-inlineable literals. 310 /// For AND or OR, try using S_BITSET{0,1} to clear or set bits. 311 /// If the inverse of the immediate is legal, use ANDN2, ORN2 or 312 /// XNOR (as a ^ b == ~(a ^ ~b)). 313 /// \returns true if the caller should continue the machine function iterator 314 static bool shrinkScalarLogicOp(const GCNSubtarget &ST, 315 MachineRegisterInfo &MRI, 316 const SIInstrInfo *TII, 317 MachineInstr &MI) { 318 unsigned Opc = MI.getOpcode(); 319 const MachineOperand *Dest = &MI.getOperand(0); 320 MachineOperand *Src0 = &MI.getOperand(1); 321 MachineOperand *Src1 = &MI.getOperand(2); 322 MachineOperand *SrcReg = Src0; 323 MachineOperand *SrcImm = Src1; 324 325 if (!SrcImm->isImm() || 326 AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) 327 return false; 328 329 uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm()); 330 uint32_t NewImm = 0; 331 332 if (Opc == AMDGPU::S_AND_B32) { 333 if (isPowerOf2_32(~Imm)) { 334 NewImm = countTrailingOnes(Imm); 335 Opc = AMDGPU::S_BITSET0_B32; 336 } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { 337 NewImm = ~Imm; 338 Opc = AMDGPU::S_ANDN2_B32; 339 } 340 } else if (Opc == AMDGPU::S_OR_B32) { 341 if (isPowerOf2_32(Imm)) { 342 NewImm = countTrailingZeros(Imm); 343 Opc = AMDGPU::S_BITSET1_B32; 344 } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { 345 NewImm = ~Imm; 346 Opc = AMDGPU::S_ORN2_B32; 347 } 348 } else if (Opc == AMDGPU::S_XOR_B32) { 349 if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { 350 NewImm = ~Imm; 351 Opc = AMDGPU::S_XNOR_B32; 352 } 353 } else { 354 llvm_unreachable("unexpected opcode"); 355 } 356 357 if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) && 358 SrcImm == Src0) { 359 if (!TII->commuteInstruction(MI, false, 1, 2)) 360 NewImm = 0; 361 } 362 363 if (NewImm != 0) { 364 if (Dest->getReg().isVirtual() && SrcReg->isReg()) { 365 MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg()); 366 MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg()); 367 return true; 368 } 369 370 if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) { 371 const bool IsUndef = SrcReg->isUndef(); 372 const bool IsKill = SrcReg->isKill(); 373 MI.setDesc(TII->get(Opc)); 374 if (Opc == AMDGPU::S_BITSET0_B32 || 375 Opc == AMDGPU::S_BITSET1_B32) { 376 Src0->ChangeToImmediate(NewImm); 377 // Remove the immediate and add the tied input. 378 MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false, 379 /*isImp*/ false, IsKill, 380 /*isDead*/ false, IsUndef); 381 MI.tieOperands(0, 2); 382 } else { 383 SrcImm->setImm(NewImm); 384 } 385 } 386 } 387 388 return false; 389 } 390 391 // This is the same as MachineInstr::readsRegister/modifiesRegister except 392 // it takes subregs into account. 393 static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R, 394 Register Reg, unsigned SubReg, 395 const SIRegisterInfo &TRI) { 396 for (const MachineOperand &MO : R) { 397 if (!MO.isReg()) 398 continue; 399 400 if (Reg.isPhysical() && MO.getReg().isPhysical()) { 401 if (TRI.regsOverlap(Reg, MO.getReg())) 402 return true; 403 } else if (MO.getReg() == Reg && Reg.isVirtual()) { 404 LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) & 405 TRI.getSubRegIndexLaneMask(MO.getSubReg()); 406 if (Overlap.any()) 407 return true; 408 } 409 } 410 return false; 411 } 412 413 static bool instReadsReg(const MachineInstr *MI, 414 unsigned Reg, unsigned SubReg, 415 const SIRegisterInfo &TRI) { 416 return instAccessReg(MI->uses(), Reg, SubReg, TRI); 417 } 418 419 static bool instModifiesReg(const MachineInstr *MI, 420 unsigned Reg, unsigned SubReg, 421 const SIRegisterInfo &TRI) { 422 return instAccessReg(MI->defs(), Reg, SubReg, TRI); 423 } 424 425 static TargetInstrInfo::RegSubRegPair 426 getSubRegForIndex(Register Reg, unsigned Sub, unsigned I, 427 const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) { 428 if (TRI.getRegSizeInBits(Reg, MRI) != 32) { 429 if (Reg.isPhysical()) { 430 Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I)); 431 } else { 432 Sub = TRI.getSubRegFromChannel(I + TRI.getChannelFromSubReg(Sub)); 433 } 434 } 435 return TargetInstrInfo::RegSubRegPair(Reg, Sub); 436 } 437 438 static void dropInstructionKeepingImpDefs(MachineInstr &MI, 439 const SIInstrInfo *TII) { 440 for (unsigned i = MI.getDesc().getNumOperands() + 441 MI.getDesc().getNumImplicitUses() + 442 MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands(); 443 i != e; ++i) { 444 const MachineOperand &Op = MI.getOperand(i); 445 if (!Op.isDef()) 446 continue; 447 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), 448 TII->get(AMDGPU::IMPLICIT_DEF), Op.getReg()); 449 } 450 451 MI.eraseFromParent(); 452 } 453 454 // Match: 455 // mov t, x 456 // mov x, y 457 // mov y, t 458 // 459 // => 460 // 461 // mov t, x (t is potentially dead and move eliminated) 462 // v_swap_b32 x, y 463 // 464 // Returns next valid instruction pointer if was able to create v_swap_b32. 465 // 466 // This shall not be done too early not to prevent possible folding which may 467 // remove matched moves, and this should prefereably be done before RA to 468 // release saved registers and also possibly after RA which can insert copies 469 // too. 470 // 471 // This is really just a generic peephole that is not a canocical shrinking, 472 // although requirements match the pass placement and it reduces code size too. 473 static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, 474 const SIInstrInfo *TII) { 475 assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 || 476 MovT.getOpcode() == AMDGPU::COPY); 477 478 Register T = MovT.getOperand(0).getReg(); 479 unsigned Tsub = MovT.getOperand(0).getSubReg(); 480 MachineOperand &Xop = MovT.getOperand(1); 481 482 if (!Xop.isReg()) 483 return nullptr; 484 Register X = Xop.getReg(); 485 unsigned Xsub = Xop.getSubReg(); 486 487 unsigned Size = TII->getOpSize(MovT, 0) / 4; 488 489 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 490 if (!TRI.isVGPR(MRI, X)) 491 return nullptr; 492 493 if (MovT.hasRegisterImplicitUseOperand(AMDGPU::M0)) 494 return nullptr; 495 496 const unsigned SearchLimit = 16; 497 unsigned Count = 0; 498 bool KilledT = false; 499 for (auto Iter = std::next(MovT.getIterator()), 500 E = MovT.getParent()->instr_end(); 501 Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) { 502 503 MachineInstr *MovY = &*Iter; 504 KilledT = MovY->killsRegister(T, &TRI); 505 506 if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 && 507 MovY->getOpcode() != AMDGPU::COPY) || 508 !MovY->getOperand(1).isReg() || 509 MovY->getOperand(1).getReg() != T || 510 MovY->getOperand(1).getSubReg() != Tsub || 511 MovY->hasRegisterImplicitUseOperand(AMDGPU::M0)) 512 continue; 513 514 Register Y = MovY->getOperand(0).getReg(); 515 unsigned Ysub = MovY->getOperand(0).getSubReg(); 516 517 if (!TRI.isVGPR(MRI, Y)) 518 continue; 519 520 MachineInstr *MovX = nullptr; 521 for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator()); 522 I != IY; ++I) { 523 if (instReadsReg(&*I, X, Xsub, TRI) || 524 instModifiesReg(&*I, Y, Ysub, TRI) || 525 instModifiesReg(&*I, T, Tsub, TRI) || 526 (MovX && instModifiesReg(&*I, X, Xsub, TRI))) { 527 MovX = nullptr; 528 break; 529 } 530 if (!instReadsReg(&*I, Y, Ysub, TRI)) { 531 if (!MovX && instModifiesReg(&*I, X, Xsub, TRI)) { 532 MovX = nullptr; 533 break; 534 } 535 continue; 536 } 537 if (MovX || 538 (I->getOpcode() != AMDGPU::V_MOV_B32_e32 && 539 I->getOpcode() != AMDGPU::COPY) || 540 I->getOperand(0).getReg() != X || 541 I->getOperand(0).getSubReg() != Xsub) { 542 MovX = nullptr; 543 break; 544 } 545 // Implicit use of M0 is an indirect move. 546 if (I->hasRegisterImplicitUseOperand(AMDGPU::M0)) 547 continue; 548 549 if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U))) 550 continue; 551 552 MovX = &*I; 553 } 554 555 if (!MovX) 556 continue; 557 558 LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY); 559 560 for (unsigned I = 0; I < Size; ++I) { 561 TargetInstrInfo::RegSubRegPair X1, Y1; 562 X1 = getSubRegForIndex(X, Xsub, I, TRI, MRI); 563 Y1 = getSubRegForIndex(Y, Ysub, I, TRI, MRI); 564 MachineBasicBlock &MBB = *MovT.getParent(); 565 auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(), 566 TII->get(AMDGPU::V_SWAP_B32)) 567 .addDef(X1.Reg, 0, X1.SubReg) 568 .addDef(Y1.Reg, 0, Y1.SubReg) 569 .addReg(Y1.Reg, 0, Y1.SubReg) 570 .addReg(X1.Reg, 0, X1.SubReg).getInstr(); 571 if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { 572 // Drop implicit EXEC. 573 MIB->RemoveOperand(MIB->getNumExplicitOperands()); 574 MIB->copyImplicitOps(*MBB.getParent(), *MovX); 575 } 576 } 577 MovX->eraseFromParent(); 578 dropInstructionKeepingImpDefs(*MovY, TII); 579 MachineInstr *Next = &*std::next(MovT.getIterator()); 580 581 if (T.isVirtual() && MRI.use_nodbg_empty(T)) { 582 dropInstructionKeepingImpDefs(MovT, TII); 583 } else { 584 Xop.setIsKill(false); 585 for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I ) { 586 unsigned OpNo = MovT.getNumExplicitOperands() + I; 587 const MachineOperand &Op = MovT.getOperand(OpNo); 588 if (Op.isKill() && TRI.regsOverlap(X, Op.getReg())) 589 MovT.RemoveOperand(OpNo); 590 } 591 } 592 593 return Next; 594 } 595 596 return nullptr; 597 } 598 599 bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { 600 if (skipFunction(MF.getFunction())) 601 return false; 602 603 MachineRegisterInfo &MRI = MF.getRegInfo(); 604 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 605 const SIInstrInfo *TII = ST.getInstrInfo(); 606 unsigned VCCReg = ST.isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; 607 608 std::vector<unsigned> I1Defs; 609 610 for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); 611 BI != BE; ++BI) { 612 613 MachineBasicBlock &MBB = *BI; 614 MachineBasicBlock::iterator I, Next; 615 for (I = MBB.begin(); I != MBB.end(); I = Next) { 616 Next = std::next(I); 617 MachineInstr &MI = *I; 618 619 if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) { 620 // If this has a literal constant source that is the same as the 621 // reversed bits of an inline immediate, replace with a bitreverse of 622 // that constant. This saves 4 bytes in the common case of materializing 623 // sign bits. 624 625 // Test if we are after regalloc. We only want to do this after any 626 // optimizations happen because this will confuse them. 627 // XXX - not exactly a check for post-regalloc run. 628 MachineOperand &Src = MI.getOperand(1); 629 if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) { 630 int32_t ReverseImm; 631 if (isReverseInlineImm(TII, Src, ReverseImm)) { 632 MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32)); 633 Src.setImm(ReverseImm); 634 continue; 635 } 636 } 637 } 638 639 if (ST.hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 || 640 MI.getOpcode() == AMDGPU::COPY)) { 641 if (auto *NextMI = matchSwap(MI, MRI, TII)) { 642 Next = NextMI->getIterator(); 643 continue; 644 } 645 } 646 647 // FIXME: We also need to consider movs of constant operands since 648 // immediate operands are not folded if they have more than one use, and 649 // the operand folding pass is unaware if the immediate will be free since 650 // it won't know if the src == dest constraint will end up being 651 // satisfied. 652 if (MI.getOpcode() == AMDGPU::S_ADD_I32 || 653 MI.getOpcode() == AMDGPU::S_MUL_I32) { 654 const MachineOperand *Dest = &MI.getOperand(0); 655 MachineOperand *Src0 = &MI.getOperand(1); 656 MachineOperand *Src1 = &MI.getOperand(2); 657 658 if (!Src0->isReg() && Src1->isReg()) { 659 if (TII->commuteInstruction(MI, false, 1, 2)) 660 std::swap(Src0, Src1); 661 } 662 663 // FIXME: This could work better if hints worked with subregisters. If 664 // we have a vector add of a constant, we usually don't get the correct 665 // allocation due to the subregister usage. 666 if (Dest->getReg().isVirtual() && Src0->isReg()) { 667 MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg()); 668 MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg()); 669 continue; 670 } 671 672 if (Src0->isReg() && Src0->getReg() == Dest->getReg()) { 673 if (Src1->isImm() && isKImmOperand(TII, *Src1)) { 674 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ? 675 AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32; 676 677 MI.setDesc(TII->get(Opc)); 678 MI.tieOperands(0, 1); 679 } 680 } 681 } 682 683 // Try to use s_cmpk_* 684 if (MI.isCompare() && TII->isSOPC(MI)) { 685 shrinkScalarCompare(TII, MI); 686 continue; 687 } 688 689 // Try to use S_MOVK_I32, which will save 4 bytes for small immediates. 690 if (MI.getOpcode() == AMDGPU::S_MOV_B32) { 691 const MachineOperand &Dst = MI.getOperand(0); 692 MachineOperand &Src = MI.getOperand(1); 693 694 if (Src.isImm() && Dst.getReg().isPhysical()) { 695 int32_t ReverseImm; 696 if (isKImmOperand(TII, Src)) 697 MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); 698 else if (isReverseInlineImm(TII, Src, ReverseImm)) { 699 MI.setDesc(TII->get(AMDGPU::S_BREV_B32)); 700 Src.setImm(ReverseImm); 701 } 702 } 703 704 continue; 705 } 706 707 // Shrink scalar logic operations. 708 if (MI.getOpcode() == AMDGPU::S_AND_B32 || 709 MI.getOpcode() == AMDGPU::S_OR_B32 || 710 MI.getOpcode() == AMDGPU::S_XOR_B32) { 711 if (shrinkScalarLogicOp(ST, MRI, TII, MI)) 712 continue; 713 } 714 715 if (TII->isMIMG(MI.getOpcode()) && 716 ST.getGeneration() >= AMDGPUSubtarget::GFX10 && 717 MF.getProperties().hasProperty( 718 MachineFunctionProperties::Property::NoVRegs)) { 719 shrinkMIMG(MI); 720 continue; 721 } 722 723 if (!TII->hasVALU32BitEncoding(MI.getOpcode())) 724 continue; 725 726 if (!TII->canShrink(MI, MRI)) { 727 // Try commuting the instruction and see if that enables us to shrink 728 // it. 729 if (!MI.isCommutable() || !TII->commuteInstruction(MI) || 730 !TII->canShrink(MI, MRI)) 731 continue; 732 } 733 734 int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); 735 736 if (TII->isVOPC(Op32)) { 737 Register DstReg = MI.getOperand(0).getReg(); 738 if (DstReg.isVirtual()) { 739 // VOPC instructions can only write to the VCC register. We can't 740 // force them to use VCC here, because this is only one register and 741 // cannot deal with sequences which would require multiple copies of 742 // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...) 743 // 744 // So, instead of forcing the instruction to write to VCC, we provide 745 // a hint to the register allocator to use VCC and then we will run 746 // this pass again after RA and shrink it if it outputs to VCC. 747 MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg); 748 continue; 749 } 750 if (DstReg != VCCReg) 751 continue; 752 } 753 754 if (Op32 == AMDGPU::V_CNDMASK_B32_e32) { 755 // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC 756 // instructions. 757 const MachineOperand *Src2 = 758 TII->getNamedOperand(MI, AMDGPU::OpName::src2); 759 if (!Src2->isReg()) 760 continue; 761 Register SReg = Src2->getReg(); 762 if (SReg.isVirtual()) { 763 MRI.setRegAllocationHint(SReg, 0, VCCReg); 764 continue; 765 } 766 if (SReg != VCCReg) 767 continue; 768 } 769 770 // Check for the bool flag output for instructions like V_ADD_I32_e64. 771 const MachineOperand *SDst = TII->getNamedOperand(MI, 772 AMDGPU::OpName::sdst); 773 774 if (SDst) { 775 bool Next = false; 776 777 if (SDst->getReg() != VCCReg) { 778 if (SDst->getReg().isVirtual()) 779 MRI.setRegAllocationHint(SDst->getReg(), 0, VCCReg); 780 Next = true; 781 } 782 783 // All of the instructions with carry outs also have an SGPR input in 784 // src2. 785 const MachineOperand *Src2 = TII->getNamedOperand(MI, 786 AMDGPU::OpName::src2); 787 if (Src2 && Src2->getReg() != VCCReg) { 788 if (Src2->getReg().isVirtual()) 789 MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg); 790 Next = true; 791 } 792 793 if (Next) 794 continue; 795 } 796 797 // We can shrink this instruction 798 LLVM_DEBUG(dbgs() << "Shrinking " << MI); 799 800 MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32); 801 ++NumInstructionsShrunk; 802 803 // Copy extra operands not present in the instruction definition. 804 copyExtraImplicitOps(*Inst32, MF, MI); 805 806 // Copy deadness from the old explicit vcc def to the new implicit def. 807 if (SDst && SDst->isDead()) 808 Inst32->findRegisterDefOperand(VCCReg)->setIsDead(); 809 810 MI.eraseFromParent(); 811 foldImmediates(*Inst32, TII, MRI); 812 813 LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n'); 814 } 815 } 816 return false; 817 } 818