//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions : public MachineFunctionPass {
  MachineRegisterInfo *MRI;
  const GCNSubtarget *ST;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;

public:
  static char ID;

public:
  SIShrinkInstructions() : MachineFunctionPass(ID) {
  }

  bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const;
  bool isKImmOperand(const MachineOperand &Src) const;
  bool isKUImmOperand(const MachineOperand &Src) const;
  bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
  bool isReverseInlineImm(const MachineOperand &Src, int32_t &ReverseImm) const;
  void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
  void shrinkScalarCompare(MachineInstr &MI) const;
  void shrinkMIMG(MachineInstr &MI) const;
  void shrinkMadFma(MachineInstr &MI) const;
  bool shrinkScalarLogicOp(MachineInstr &MI) const;
  bool tryReplaceDeadSDST(MachineInstr &MI) const;
  bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                     Register Reg, unsigned SubReg) const;
  bool instReadsReg(const MachineInstr *MI, unsigned Reg,
                    unsigned SubReg) const;
  bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
                       unsigned SubReg) const;
  TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
                                                   unsigned I) const;
  void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
  MachineInstr *matchSwap(MachineInstr &MovT) const;

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
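///
/// A sketch of the rewrite this enables (registers and opcode are only
/// illustrative):
///   %1:vgpr_32 = V_MOV_B32_e32 0x11213141
///   %2:vgpr_32 = V_ADD_F32_e32 %1, %0
/// becomes, when the literal is legal for src0,
///   %2:vgpr_32 = V_ADD_F32_e32 0x11213141, %0
/// and the V_MOV_B32 is erased once it has no remaining non-debug uses.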
bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
                                          bool TryToCommute) const {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    Register Reg = Src0.getReg();
    if (Reg.isVirtual()) {
      MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
          if (MovSrc.isImm() &&
              (isInt<32>(MovSrc.getImm()) || isUInt<32>(MovSrc.getImm()))) {
            Src0.ChangeToImmediate(MovSrc.getImm());
            ConstantFolded = true;
          } else if (MovSrc.isFI()) {
            Src0.ChangeToFrameIndex(MovSrc.getIndex());
            ConstantFolded = true;
          } else if (MovSrc.isGlobal()) {
            Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
                            MovSrc.getTargetFlags());
            ConstantFolded = true;
          }
        }

        if (ConstantFolded) {
          if (MRI->use_nodbg_empty(Reg))
            Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
  return isInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(),
                                Src.getParent()->getOperandNo(&Src));
}

bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
  return isUInt<16>(Src.getImm()) &&
         !TII->isInlineConstant(*Src.getParent(),
                                Src.getParent()->getOperandNo(&Src));
}

bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
                                                bool &IsUnsigned) const {
  if (isInt<16>(Src.getImm())) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns true if the constant in \p Src should be replaced with a bitreverse
/// of an inline immediate.
bool SIShrinkInstructions::isReverseInlineImm(const MachineOperand &Src,
                                              int32_t &ReverseImm) const {
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
    return false;

  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
  return ReverseImm >= -16 && ReverseImm <= 64;
}

/// Copy implicit register operands from \p MI to \p NewMI that are not part of
/// the instruction definition.
void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
                                                MachineInstr &MI) const {
  MachineFunction &MF = *MI.getMF();
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().getNumImplicitUses() +
                    MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
  // get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  // cmpk requires src0 to be a register
  const MachineOperand &Src0 = MI.getOperand(0);
  if (!Src0.isReg())
    return;

  const MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
    MI.setDesc(NewDesc);
  }
}

// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
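// For example (an illustrative register assignment only): an NSA image
// instruction whose address operands happen to be allocated to v4, v5, v6 and
// v7 can instead address a single contiguous VReg_128 starting at v4, which
// the shorter non-NSA encoding can express.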
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info)
    return;

  uint8_t NewEncoding;
  switch (Info->MIMGEncoding) {
  case AMDGPU::MIMGEncGfx10NSA:
    NewEncoding = AMDGPU::MIMGEncGfx10Default;
    break;
  case AMDGPU::MIMGEncGfx11NSA:
    NewEncoding = AMDGPU::MIMGEncGfx11Default;
    break;
  default:
    return;
  }

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords == 5) {
    RC = &AMDGPU::VReg_160RegClass;
  } else if (Info->VAddrDwords == 6) {
    RC = &AMDGPU::VReg_192RegClass;
  } else if (Info->VAddrDwords == 7) {
    RC = &AMDGPU::VReg_224RegClass;
  } else if (Info->VAddrDwords == 8) {
    RC = &AMDGPU::VReg_256RegClass;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

  unsigned VgprBase = 0;
  unsigned NextVgpr = 0;
  bool IsUndef = true;
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  for (unsigned Idx = 0; Idx < Info->VAddrOperands; ++Idx) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx);
    unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());
    unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32;
    assert(Dwords > 0 && "Un-implemented for less than 32 bit regs");

    if (Idx == 0) {
      VgprBase = Vgpr;
      NextVgpr = Vgpr + Dwords;
    } else if (Vgpr == NextVgpr) {
      NextVgpr = Vgpr + Dwords;
    } else {
      return;
    }

    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  if (VgprBase + NewAddrDwords > 256)
    return;

  // Further check for implicit tied operands - these may be present if TFE or
  // LWE is enabled.
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand
        assert(
            ToUntie == -1 &&
            "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  unsigned NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, NewEncoding,
                                             Info->VDataDwords, NewAddrDwords);
  MI.setDesc(TII->get(NewOpcode));
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

  for (int i = 1; i < Info->VAddrOperands; ++i)
    MI.removeOperand(VAddr0Idx + 1);

  if (ToUntie >= 0) {
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (Info->VAddrOperands - 1));
  }
}

// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
  if (!ST->hasVOP3Literal())
    return;

  if (TII->hasAnyModifiersSet(MI))
    return;

  const unsigned Opcode = MI.getOpcode();
  MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;

  bool Swap;

  // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
  if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
    if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg()))
      Swap = false;
    else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADAK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAAK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADAK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
      NewOpcode = AMDGPU::V_FMAAK_F16;
      break;
    }
  }

  // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
  if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
    if (Src1.isImm() && !TII->isInlineConstant(Src1))
      Swap = false;
    else if (Src0.isImm() && !TII->isInlineConstant(Src0))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADMK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAMK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADMK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
      NewOpcode = AMDGPU::V_FMAMK_F16;
      break;
    }
  }

  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
    return;

  if (Swap) {
    // Swap Src0 and Src1 by building a new instruction.
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
            MI.getOperand(0).getReg())
        .add(Src1)
        .add(Src0)
        .add(Src2)
        .setMIFlags(MI.getFlags());
    MI.eraseFromParent();
  } else {
    TII->removeModOperands(MI);
    MI.setDesc(TII->get(NewOpcode));
  }
}

/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
/// \returns true if the caller should continue the machine function iterator
bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  if (!SrcImm->isImm() ||
      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
    return false;

  uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
  uint32_t NewImm = 0;

  if (Opc == AMDGPU::S_AND_B32) {
    if (isPowerOf2_32(~Imm)) {
      NewImm = countTrailingOnes(Imm);
      Opc = AMDGPU::S_BITSET0_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ANDN2_B32;
    }
  } else if (Opc == AMDGPU::S_OR_B32) {
    if (isPowerOf2_32(Imm)) {
      NewImm = countTrailingZeros(Imm);
      Opc = AMDGPU::S_BITSET1_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ORN2_B32;
    }
  } else if (Opc == AMDGPU::S_XOR_B32) {
    if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_XNOR_B32;
    }
  } else {
    llvm_unreachable("unexpected opcode");
  }

  if (NewImm != 0) {
    if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
      MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
      MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
      return true;
    }

    if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
      const bool IsUndef = SrcReg->isUndef();
      const bool IsKill = SrcReg->isKill();
      MI.setDesc(TII->get(Opc));
      if (Opc == AMDGPU::S_BITSET0_B32 ||
          Opc == AMDGPU::S_BITSET1_B32) {
        Src0->ChangeToImmediate(NewImm);
        // Remove the immediate and add the tied input.
        MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
                                          /*isImp*/ false, IsKill,
                                          /*isDead*/ false, IsUndef);
        MI.tieOperands(0, 2);
      } else {
        SrcImm->setImm(NewImm);
      }
    }
  }

  return false;
}

// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
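// For virtual registers the check is done on subregister lane masks, so e.g.
// an operand using only %X.sub1 is not treated as overlapping a def that only
// writes %X.sub0.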
bool SIShrinkInstructions::instAccessReg(
    iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
    unsigned SubReg) const {
  for (const MachineOperand &MO : R) {
    if (!MO.isReg())
      continue;

    if (Reg.isPhysical() && MO.getReg().isPhysical()) {
      if (TRI->regsOverlap(Reg, MO.getReg()))
        return true;
    } else if (MO.getReg() == Reg && Reg.isVirtual()) {
      LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) &
                            TRI->getSubRegIndexLaneMask(MO.getSubReg());
      if (Overlap.any())
        return true;
    }
  }
  return false;
}

bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg,
                                        unsigned SubReg) const {
  return instAccessReg(MI->uses(), Reg, SubReg);
}

bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg,
                                           unsigned SubReg) const {
  return instAccessReg(MI->defs(), Reg, SubReg);
}

TargetInstrInfo::RegSubRegPair
SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
                                        unsigned I) const {
  if (TRI->getRegSizeInBits(Reg, *MRI) != 32) {
    if (Reg.isPhysical()) {
      Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I));
    } else {
      Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub));
    }
  }
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}

void SIShrinkInstructions::dropInstructionKeepingImpDefs(
    MachineInstr &MI) const {
  for (unsigned i = MI.getDesc().getNumOperands() +
                    MI.getDesc().getNumImplicitUses() +
                    MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &Op = MI.getOperand(i);
    if (!Op.isDef())
      continue;
    BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::IMPLICIT_DEF), Op.getReg());
  }

  MI.eraseFromParent();
}

// Match:
// mov t, x
// mov x, y
// mov y, t
//
// =>
//
// mov t, x (t is potentially dead and move eliminated)
// v_swap_b32 x, y
//
// Returns a pointer to the next valid instruction if it was able to create
// v_swap_b32.
//
// This should not be done too early, so that it does not prevent folding
// which may remove the matched moves. It should preferably be done before RA
// to release saved registers, and possibly again after RA, which can insert
// copies too.
//
// This is really just a generic peephole that is not a canonical shrinking,
// although requirements match the pass placement and it reduces code size too.
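//
// Note that for moves wider than 32 bits the rewrite below emits one
// V_SWAP_B32 per 32-bit channel of the matched registers.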
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  Register T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;
  Register X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  unsigned Size = TII->getOpSize(MovT, 0) / 4;

  if (!TRI->isVGPR(*MRI, X))
    return nullptr;

  if (MovT.hasRegisterImplicitUseOperand(AMDGPU::M0))
    return nullptr;

  const unsigned SearchLimit = 16;
  unsigned Count = 0;
  bool KilledT = false;
  for (auto Iter = std::next(MovT.getIterator()),
            E = MovT.getParent()->instr_end();
       Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {

    MachineInstr *MovY = &*Iter;
    KilledT = MovY->killsRegister(T, TRI);

    if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY->getOpcode() != AMDGPU::COPY) ||
        !MovY->getOperand(1).isReg() ||
        MovY->getOperand(1).getReg() != T ||
        MovY->getOperand(1).getSubReg() != Tsub ||
        MovY->hasRegisterImplicitUseOperand(AMDGPU::M0))
      continue;

    Register Y = MovY->getOperand(0).getReg();
    unsigned Ysub = MovY->getOperand(0).getSubReg();

    if (!TRI->isVGPR(*MRI, Y))
      continue;

    MachineInstr *MovX = nullptr;
    for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
         I != IY; ++I) {
      if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
          instModifiesReg(&*I, T, Tsub) ||
          (MovX && instModifiesReg(&*I, X, Xsub))) {
        MovX = nullptr;
        break;
      }
      if (!instReadsReg(&*I, Y, Ysub)) {
        if (!MovX && instModifiesReg(&*I, X, Xsub)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }
      // Implicit use of M0 is an indirect move.
      if (I->hasRegisterImplicitUseOperand(AMDGPU::M0))
        continue;

      if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
        continue;

      MovX = &*I;
    }

    if (!MovX)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY);

    for (unsigned I = 0; I < Size; ++I) {
      TargetInstrInfo::RegSubRegPair X1, Y1;
      X1 = getSubRegForIndex(X, Xsub, I);
      Y1 = getSubRegForIndex(Y, Ysub, I);
      MachineBasicBlock &MBB = *MovT.getParent();
      auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
                         TII->get(AMDGPU::V_SWAP_B32))
                     .addDef(X1.Reg, 0, X1.SubReg)
                     .addDef(Y1.Reg, 0, Y1.SubReg)
                     .addReg(Y1.Reg, 0, Y1.SubReg)
                     .addReg(X1.Reg, 0, X1.SubReg).getInstr();
      if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
        // Drop implicit EXEC.
        MIB->removeOperand(MIB->getNumExplicitOperands());
        MIB->copyImplicitOps(*MBB.getParent(), *MovX);
      }
    }
    MovX->eraseFromParent();
    dropInstructionKeepingImpDefs(*MovY);
    MachineInstr *Next = &*std::next(MovT.getIterator());

    if (T.isVirtual() && MRI->use_nodbg_empty(T)) {
      dropInstructionKeepingImpDefs(MovT);
    } else {
      Xop.setIsKill(false);
      for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I) {
        unsigned OpNo = MovT.getNumExplicitOperands() + I;
        const MachineOperand &Op = MovT.getOperand(OpNo);
        if (Op.isKill() && TRI->regsOverlap(X, Op.getReg()))
          MovT.removeOperand(OpNo);
      }
    }

    return Next;
  }

  return nullptr;
}

// If an instruction has a dead sdst, replace it with the NULL register on
// gfx1030+.
bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
  if (!ST->hasGFX10_3Insts())
    return false;

  MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Op)
    return false;
  Register SDstReg = Op->getReg();
  if (SDstReg.isPhysical() || !MRI->use_nodbg_empty(SDstReg))
    return false;

  Op->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64);
  return true;
}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of materializing
        // sign bits. For example, 0x80000000 is not an inline immediate, but
        // reversing its bits gives 1, which is, so the value can instead be
        // materialized with V_BFREV_B32_e32 1.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
            Src.setImm(ReverseImm);
            continue;
          }
        }
      }

      if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                            MI.getOpcode() == AMDGPU::COPY)) {
        if (auto *NextMI = matchSwap(MI)) {
          Next = NextMI->getIterator();
          continue;
        }
      }

      // Try to use S_ADDK_I32 and S_MULK_I32.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (Dest->getReg().isVirtual() && Src0->isReg()) {
          MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(*Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() && Dst.getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isKImmOperand(Src))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
          else if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
            Src.setImm(ReverseImm);
          }
        }

        continue;
      }

      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(MI))
          continue;
      }

      if (TII->isMIMG(MI.getOpcode()) &&
          ST->getGeneration() >= AMDGPUSubtarget::GFX10 &&
          MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs)) {
        shrinkMIMG(MI);
        continue;
      }

      if (!TII->isVOP3(MI))
        continue;

      if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_e64) {
        shrinkMadFma(MI);
        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
        // If there is no chance we will shrink it and use VCC as sdst to get
        // a 32-bit form, try to replace a dead sdst with NULL.
        tryReplaceDeadSDST(MI);
        continue;
      }

      if (!TII->canShrink(MI, *MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, *MRI)) {
          tryReplaceDeadSDST(MI);
          continue;
        }
      }

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        MachineOperand &Op0 = MI.getOperand(0);
        if (Op0.isReg()) {
          // Exclude VOPCX instructions as these don't explicitly write a
          // dst.
          Register DstReg = Op0.getReg();
          if (DstReg.isVirtual()) {
            // VOPC instructions can only write to the VCC register. We can't
            // force them to use VCC here, because this is only one register and
            // cannot deal with sequences which would require multiple copies of
            // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
            //
            // So, instead of forcing the instruction to write to VCC, we
            // provide a hint to the register allocator to use VCC and then we
            // will run this pass again after RA and shrink it if it outputs to
            // VCC.
            MRI->setRegAllocationHint(DstReg, 0, VCCReg);
            continue;
          }
          if (DstReg != VCCReg)
            continue;
        }
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
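        // The e32 form reads its condition from VCC implicitly, so the e64
        // src2 operand has to end up in VCC before the instruction can be
        // shrunk; hint it to VCC and retry after RA.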
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        Register SReg = Src2->getReg();
        if (SReg.isVirtual()) {
          MRI->setRegAllocationHint(SReg, 0, VCCReg);
          continue;
        }
        if (SReg != VCCReg)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      if (SDst) {
        bool Next = false;

        if (SDst->getReg() != VCCReg) {
          if (SDst->getReg().isVirtual())
            MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
          Next = true;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                          AMDGPU::OpName::src2);
        if (Src2 && Src2->getReg() != VCCReg) {
          if (Src2->getReg().isVirtual())
            MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
          Next = true;
        }

        if (Next)
          continue;
      }

      // We can shrink this instruction
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MI);

      // Copy deadness from the old explicit vcc def to the new implicit def.
      if (SDst && SDst->isDead())
        Inst32->findRegisterDefOperand(VCCReg)->setIsDead();

      MI.eraseFromParent();
      foldImmediates(*Inst32);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}